In [259]:
import pandas as pd
import numpy as np

# Import Regular Expression
import re

# Loading the data
df = pd.read_csv("Data/jumia_mobile_phones.csv")

# Display the first few rows of the data to get a sense of the data structure
df.head()

Unnamed: 0,Name,Price,Old Price,Discount,Rating
0,"XIAOMI Redmi A3, 6.71"", 3GB RAM + 64GB (Dual S...","KSh 8,999","KSh 11,000",18%,4.1 out of 5
1,"Tecno Spark 20, Android 13, 6.6"", 128GB + 4GB ...","KSh 12,925","KSh 15,000",14%,4.4 out of 5
2,"Itel S23 6.6"", 128GB + 4GB RAM, 50MP Camera, (...","KSh 8,940","KSh 10,000",11%,4.3 out of 5
3,"Itel S23 6.6"", 128GB + 4GB RAM, 50MP Camera, (...","KSh 8,940","KSh 10,000",11%,4.1 out of 5
4,"Samsung Galaxy A05, 6.7'' 4GB RAM + 128GB ROM ...","KSh 12,930","KSh 14,000",8%,4.6 out of 5


In [260]:
# Get all the entries in the name column and assign them to a list called data
data = df['Name']

# convert the data type of data to a list
data = data.tolist()

# Check the data type of data
type(data)

list

In [261]:
# Investigating the list of names
data

['XIAOMI Redmi A3, 6.71", 3GB RAM + 64GB (Dual SIM), 5000mAh, Midnight Black (1YR WRTY)',
 'Tecno Spark 20, Android 13, 6.6", 128GB + 4GB RAM(4GB Extended), 50MP, 5000mAh, Gravity Black',
 'Itel S23 6.6", 128GB + 4GB RAM, 50MP Camera, (Dual SIM), 4G, 5000mAh - Starry Black',
 'Itel S23 6.6", 128GB + 4GB RAM, 50MP Camera, (Dual SIM), 4G, 5000mAh - Mystery White (1YR WRTY)',
 "Samsung Galaxy A05, 6.7'' 4GB RAM + 128GB ROM (Dual Sim) 50MP Camera, 5000mAh, Black (1YR WRTY)",
 "Samsung GALAXY A15, 6.5'' HD+, 4GB RAM + 128GB ROM, DUAL SIM, 50MP,  5000mAh - Black",
 "Samsung Galaxy A05, 6.7'' 4GB RAM + 128GB ROM (Dual Sim) 50MP, 5000mAh - Black + Smart Watch & Buds",
 'Infinix Smart 8 6.6" HD, 2GB RAM + 64GB , Android 13 (Dual sim) 5000mAh - Timber Black',
 'XIAOMI Redmi 13C, 6.74"  8GB RAM + 256GB ROM, 50MP AI Triple Camera, 5000mAh (Dual Sim) - Midnight Black + Free Gifts',
 'Itel S23 6.6", 128GB + 4GB RAM, 50MP Camera, (Dual SIM), 4G, 5000mAh - Luxurious Gold',
 "Tecno Spark 20, 6.6'' HD+,

Here we can clearly see that there is so much cleaning to be done. Let us do the basic cleaning by defining a function.

In [262]:
def clean_data(data):
    cleaned_data = []
    
    for entry in data:
        # Remove unwanted characters, keeping numbers and spaces
        cleaned_entry = re.sub(r"[,'+\-]", " ", entry)  # Replace commas, single quotes, and plus signs with space
        cleaned_entry = re.sub(r"\s+", " ", cleaned_entry)  # Replace multiple spaces with a single space
        cleaned_entry = cleaned_entry.strip()  # Remove leading and trailing spaces
        cleaned_data.append(cleaned_entry)
    
    return cleaned_data

# Clean the data
cleaned_data = clean_data(data)

cleaned_data

['XIAOMI Redmi A3 6.71" 3GB RAM 64GB (Dual SIM) 5000mAh Midnight Black (1YR WRTY)',
 'Tecno Spark 20 Android 13 6.6" 128GB 4GB RAM(4GB Extended) 50MP 5000mAh Gravity Black',
 'Itel S23 6.6" 128GB 4GB RAM 50MP Camera (Dual SIM) 4G 5000mAh Starry Black',
 'Itel S23 6.6" 128GB 4GB RAM 50MP Camera (Dual SIM) 4G 5000mAh Mystery White (1YR WRTY)',
 'Samsung Galaxy A05 6.7 4GB RAM 128GB ROM (Dual Sim) 50MP Camera 5000mAh Black (1YR WRTY)',
 'Samsung GALAXY A15 6.5 HD 4GB RAM 128GB ROM DUAL SIM 50MP 5000mAh Black',
 'Samsung Galaxy A05 6.7 4GB RAM 128GB ROM (Dual Sim) 50MP 5000mAh Black Smart Watch & Buds',
 'Infinix Smart 8 6.6" HD 2GB RAM 64GB Android 13 (Dual sim) 5000mAh Timber Black',
 'XIAOMI Redmi 13C 6.74" 8GB RAM 256GB ROM 50MP AI Triple Camera 5000mAh (Dual Sim) Midnight Black Free Gifts',
 'Itel S23 6.6" 128GB 4GB RAM 50MP Camera (Dual SIM) 4G 5000mAh Luxurious Gold',
 'Tecno Spark 20 6.6 HD UP to 8GB RAM 128GB ROM (Dual Sim) 50MP 5000 mAh Cyber White',
 'Tecno Spark 20C 6.6 256GB H

Great looks like we are off to a good start now that we have done the preliminary cleaning. Upon close inspection, we can see that the naming pattern between Samsung phones and other brands are quite different. Therefore it is crucial that we separate the entries in list into Samsung phones and Other phones. Unless we do that, it would be a nightmare trying to extract the infos we are looking for. Here we go

In [263]:
# Separate Samsung entries from other brands
samsung_phones = [spec for spec in data if "Samsung" in spec]
other_phones = [spec for spec in data if "Samsung" not in spec]

# Let us check the result
samsung_phones, other_phones

(["Samsung Galaxy A05, 6.7'' 4GB RAM + 128GB ROM (Dual Sim) 50MP Camera, 5000mAh, Black (1YR WRTY)",
  "Samsung GALAXY A15, 6.5'' HD+, 4GB RAM + 128GB ROM, DUAL SIM, 50MP,  5000mAh - Black",
  "Samsung Galaxy A05, 6.7'' 4GB RAM + 128GB ROM (Dual Sim) 50MP, 5000mAh - Black + Smart Watch & Buds",
  "Samsung Galaxy A15, 6.5'' Display, 4GB RAM + 128GB ROM (Dual Sim) 50MP,  5000mAh - Blue(2YRs WRTY)",
  'Samsung Galaxy A05s, 6.7", 64GB + 4GB (Dual SIM), 5000mAh, Black',
  'Samsung  Galaxy A05S, 6.7", 90HZ, 4GB RAM + 128GB ROM, 50MP, 5000mAh - Black',
  "Samsung Galaxy A05, 6.7''  4GB RAM + 64GB ROM (Dual Sim) 50MP Camera, 5000mAh - Black",
  "Samsung GALAXY A15, 6.5'', 4GB RAM + 128GB ROM, DUAL SIM, 50MP, 5000mAh - Blue + Smart watch & Power Bank",
  "Samsung Galaxy A15, 6.5'' Display, 6GB RAM + 128GB ROM (Dual Sim) 50MP,  5000mAh - Blue + Smart watch & Power bank",
  'Samsung  Galaxy A05S, 6.7", 90HZ, 4GB RAM + 128GB ROM, 50MP, 5000mAh - Silver',
  "Samsung Galaxy A05, 6.7'' 4GB RAM + 128G

In [264]:

# Further cleaning to remove mathematical operators within the list
def clean_entries(entries):
    cleaned_entries = []
    for entry in entries:
        # Remove mathematical operators
        cleaned_entry = re.sub(r'[+\-]', '', entry)
        cleaned_entries.append(cleaned_entry)
    return cleaned_entries

# Clean the samsung phones list
samsung_phones = clean_entries(samsung_phones)


# Display the cleaned Samsung phone specifications
samsung_phones

["Samsung Galaxy A05, 6.7'' 4GB RAM  128GB ROM (Dual Sim) 50MP Camera, 5000mAh, Black (1YR WRTY)",
 "Samsung GALAXY A15, 6.5'' HD, 4GB RAM  128GB ROM, DUAL SIM, 50MP,  5000mAh  Black",
 "Samsung Galaxy A05, 6.7'' 4GB RAM  128GB ROM (Dual Sim) 50MP, 5000mAh  Black  Smart Watch & Buds",
 "Samsung Galaxy A15, 6.5'' Display, 4GB RAM  128GB ROM (Dual Sim) 50MP,  5000mAh  Blue(2YRs WRTY)",
 'Samsung Galaxy A05s, 6.7", 64GB  4GB (Dual SIM), 5000mAh, Black',
 'Samsung  Galaxy A05S, 6.7", 90HZ, 4GB RAM  128GB ROM, 50MP, 5000mAh  Black',
 "Samsung Galaxy A05, 6.7''  4GB RAM  64GB ROM (Dual Sim) 50MP Camera, 5000mAh  Black",
 "Samsung GALAXY A15, 6.5'', 4GB RAM  128GB ROM, DUAL SIM, 50MP, 5000mAh  Blue  Smart watch & Power Bank",
 "Samsung Galaxy A15, 6.5'' Display, 6GB RAM  128GB ROM (Dual Sim) 50MP,  5000mAh  Blue  Smart watch & Power bank",
 'Samsung  Galaxy A05S, 6.7", 90HZ, 4GB RAM  128GB ROM, 50MP, 5000mAh  Silver',
 "Samsung Galaxy A05, 6.7'' 4GB RAM  128GB ROM (Dual Sim) 50MP Camera, 5000

Ok now that our data is nicely cleaned and separated, let us start the process of extracting the phones' features from the list. Regular Expression comes very handy for this process. 

In [265]:

# Feature extraction from the samsung phones using regular expressions
def extract_samsung_info(spec):
    # Regular expressions for extracting relevant parts
    name_pattern = r'^(Samsung\s+Galaxy\s+\w+\s*\w*)'  # Without the case-insensitivity flag
    display_pattern = r'(\d+(\.\d+)?)\s*["\']'
    ram_pattern = r'(\d+)GB RAM'
    storage_pattern = r'(\d+GB)\s*ROM'
    camera_pattern = r'(\d+MP)'
    battery_pattern = r'(\d{4}mAh)'
    sim_pattern = r'\(?(Dual\s+Sim|DUAL\s+SIM|Single\s+Sim|Single)\)?'
    
    # Extract all the numbers that are followed by GB
    gb_values = [int(m.group(1)) for m in re.finditer(r'(\d+)GB', spec)]

    # Determine RAM and Storage based on size
    ram = min(gb_values) if gb_values else None
    storage = max(gb_values) if gb_values else None

    # Updated color pattern to ensure it captures correctly
    color_pattern = r'(?i)[-\s](Black|White|Blue|Red|Green|Yellow|Gold|Silver|Gray|Pink)(?=\s|\(|$)'
    color_match = re.search(color_pattern, spec)
    color = color_match.group(1).strip() if color_match else None

    # Set a default display size if not found
    display_size = re.search(display_pattern, spec).group(1) if re.search(display_pattern, spec) else "6.0"

    # Use case-insensitive search for name
    name_match = re.search(name_pattern, spec, re.IGNORECASE)
    
    return {
        "Name": name_match.group(0).strip() if name_match else None,
        "Display Size": display_size,
        "RAM": ram,
        "Storage": storage,
        "Camera": re.search(camera_pattern, spec).group(1) if re.search(camera_pattern, spec) else None,
        "Battery": re.search(battery_pattern, spec).group(1) if re.search(battery_pattern, spec) else None,
        "Color": color,
        "Warranty": "Yes" if "WRTY" in spec else "No",
        "Number of SIMs": 2 if re.search(sim_pattern, spec) and "Dual" in re.search(sim_pattern, spec).group(0) else 1
    }

# Extract information for Samsung phones
samsung_phones_data = [extract_samsung_info(spec) for spec in samsung_phones]

# Create a DataFrame
samsung_phones_df = pd.DataFrame(samsung_phones_data)

# Display the DataFrame
print("Samsung Phones:")
samsung_phones_df.head(10)

Samsung Phones:


Unnamed: 0,Name,Display Size,RAM,Storage,Camera,Battery,Color,Warranty,Number of SIMs
0,Samsung Galaxy A05,6.7,4,128,50MP,5000mAh,Black,Yes,2
1,Samsung GALAXY A15,6.5,4,128,50MP,5000mAh,Black,No,1
2,Samsung Galaxy A05,6.7,4,128,50MP,5000mAh,Black,No,2
3,Samsung Galaxy A15,6.5,4,128,50MP,5000mAh,Blue,Yes,2
4,Samsung Galaxy A05s,6.7,4,64,,5000mAh,Black,No,1
5,Samsung Galaxy A05S,6.7,4,128,50MP,5000mAh,Black,No,1
6,Samsung Galaxy A05,6.7,4,64,50MP,5000mAh,Black,No,2
7,Samsung GALAXY A15,6.5,4,128,50MP,5000mAh,Blue,No,1
8,Samsung Galaxy A15,6.5,6,128,50MP,5000mAh,Blue,No,2
9,Samsung Galaxy A05S,6.7,4,128,50MP,5000mAh,Silver,No,1


Done with Samsung phones. let us now move on to the list of other phones

In [266]:
# Cleaning up other phones
def clean_entries(entries):
    cleaned_entries = []
    for entry in entries:
        # Remove mathematical operators
        cleaned_entry = re.sub(r'[+\-]', '', entry)
        cleaned_entries.append(cleaned_entry)
    return cleaned_entries

# Clean the list
other_phones = clean_entries(other_phones)

other_phones

['XIAOMI Redmi A3, 6.71", 3GB RAM  64GB (Dual SIM), 5000mAh, Midnight Black (1YR WRTY)',
 'Tecno Spark 20, Android 13, 6.6", 128GB  4GB RAM(4GB Extended), 50MP, 5000mAh, Gravity Black',
 'Itel S23 6.6", 128GB  4GB RAM, 50MP Camera, (Dual SIM), 4G, 5000mAh  Starry Black',
 'Itel S23 6.6", 128GB  4GB RAM, 50MP Camera, (Dual SIM), 4G, 5000mAh  Mystery White (1YR WRTY)',
 'Infinix Smart 8 6.6" HD, 2GB RAM  64GB , Android 13 (Dual sim) 5000mAh  Timber Black',
 'XIAOMI Redmi 13C, 6.74"  8GB RAM  256GB ROM, 50MP AI Triple Camera, 5000mAh (Dual Sim)  Midnight Black  Free Gifts',
 'Itel S23 6.6", 128GB  4GB RAM, 50MP Camera, (Dual SIM), 4G, 5000mAh  Luxurious Gold',
 "Tecno Spark 20, 6.6'' HD, UP to 8GB RAM  128GB ROM (Dual Sim) 50MP, 5000 mAh  Cyber White",
 "Tecno Spark 20C  6.6''  256GB HDD  4GB RAM  (8GB RAM Extended)  50MP Rear/8 MP Front, 5000mAh 18W Gravity Black",
 'Infinix Smart 8 6.6" HD, 2GB RAM  64GB , Android 13 (Dual sim) 5000mAh  Shiny Gold',
 'Tecno POP 8, 6.6", 128GB ROM 4GB RA

In [None]:

# Feature extraction from the list of other phones
def extract_other_phones_info(spec):
    # Regular expressions for extracting relevant parts
    name_pattern = r'^(XIAOMI|Tecno|Infinix|Itel|Oale)\s+(\w+\s*\w*)'
    display_pattern = r'(\d+(\.\d+)?)\s*["\']'
    ram_pattern = r'(\d+)GB\s*RAM'
    storage_pattern = r'(\d+GB|(\d+MB))\s*(ROM|Storage)'
    camera_pattern = r'(\d+MP)'
    battery_pattern = r'(\d{4}mAh|\d{4}MAh)'
    sim_pattern = r'\(?(Dual\s+SIM|Single\s+SIM|Single)\)?'
    
    # Extract RAM and Storage
    gb_values = [int(m.group(1)) for m in re.finditer(r'(\d+)GB', spec)]
    
    ram = min(gb_values) if gb_values else None
    storage = max(gb_values) if gb_values else None

    # Extract color with a broader pattern
    color_pattern = r'(?i)[-\s](Black|White|Blue|Red|Green|Yellow|Gold|Silver|Gray|Pink)(?=\s|\(|$)'
    color_match = re.search(color_pattern, spec)
    color = color_match.group(1).strip() if color_match else None

    # Set a default display size if not found
    display_size = re.search(display_pattern, spec).group(1) if re.search(display_pattern, spec) else "6.0"

    # Extract name, ensuring it doesn't include the display size
    name_match = re.search(name_pattern, spec, re.IGNORECASE)
    if name_match:
        name = name_match.group(0).strip()
        # Remove any trailing numbers or characters (like display sizes)
        name = re.sub(r'\s*\d*$', '', name).strip()
    else:
        name = None

    return {
        "Name": name,
        "Display Size": display_size,
        "RAM": ram,
        "Storage": storage,
        "Camera": re.search(camera_pattern, spec).group(1) if re.search(camera_pattern, spec) else None,
        "Battery": re.search(battery_pattern, spec).group(1) if re.search(battery_pattern, spec) else None,
        "Color": color,
        "Warranty": "Yes" if "WRTY" in spec else "No",
        "Number of SIMs": 2 if re.search(sim_pattern, spec) and "Dual" in re.search(sim_pattern, spec).group(0) else 1
    }

# Extract information for other phones
other_phones_data = [extract_other_phones_info(spec) for spec in other_phones]

# Create a DataFrame
other_phones_df = pd.DataFrame(other_phones_data)

# Display the DataFrame
print("Other Phones:")
other_phones_df.head(20)

Other Phones:


Unnamed: 0,Name,Display Size,RAM,Storage,Camera,Battery,Color,Warranty,Number of SIMs
0,XIAOMI Redmi A,6.71,3,64,,5000mAh,Black,Yes,2
1,Tecno Spark,6.6,4,128,50MP,5000mAh,Black,No,1
2,Itel S23,6.6,4,128,50MP,5000mAh,Black,No,2
3,Itel S23,6.6,4,128,50MP,5000mAh,White,Yes,2
4,Infinix Smart,6.6,2,64,,5000mAh,Black,No,1
5,XIAOMI Redmi 13C,6.74,8,256,50MP,5000mAh,Black,No,1
6,Itel S23,6.6,4,128,50MP,5000mAh,Gold,No,2
7,Tecno Spark,6.6,8,128,50MP,,White,No,1
8,Tecno Spark 20C,6.6,4,256,50MP,5000mAh,Black,No,1
9,Infinix Smart,6.6,2,64,,5000mAh,Gold,No,1


In [275]:
# Merge the samsung phones dataframe into the other phones dataframe by concatination sortting them by name
smartphones_df = pd.concat([samsung_phones_df, other_phones_df]).sort_values("Name")


# # Merge the dataframes
# smartphones_df = pd.concat([samsung_phones_df, other_phones_df])

# Display the merged DataFrame
smartphones_df.head()

Unnamed: 0,Name,Display Size,RAM,Storage,Camera,Battery,Color,Warranty,Number of SIMs
7940,Infinix HOT 40i,6.0,4,128,50MP,,,No,1
6050,Infinix HOT 40i,6.0,4,128,50MP,,,No,1
440,Infinix HOT 40i,6.0,4,128,50MP,,,No,1
8720,Infinix HOT 40i,6.0,4,128,50MP,,,No,1
410,Infinix HOT 40i,6.0,4,128,50MP,,,No,1


In [273]:
# Let us check the name column
smartphones_df["Name"].value_counts()

Name
Itel S23                1800
Tecno POP               1200
Samsung Galaxy A05       900
Infinix Smart            900
XIAOMI Redmi 14C         900
Samsung Galaxy A15       600
Samsung  Galaxy A05S     600
XIAOMI Redmi A           600
Tecno Spark              600
Samsung GALAXY A15       600
XIAOMI Redmi Note        600
Infinix HOT 40i          300
Infinix Hot 40i          300
Itel A18                 300
Tecno Pova               300
Tecno Spark 20C          300
Oale POP                 300
XIAOMI Redmi 13C         300
Samsung Galaxy A05s      300
Tecno SPARK              300
Name: count, dtype: int64