In [None]:
import pandas as pd
import numpy as np

# Import Regular Expression
import re

# Read the two CSV inputs: the phone listings (ISO-8859-1 encoded, first column
# used as the row index) and the catalogue-popularity table.
df = pd.read_csv(
    "jumia_mobile_phones.csv",
    encoding="ISO-8859-1",
    index_col=0,
)
df2 = pd.read_csv("jumia_phone_catalog_popularity.csv")

# Preview the top rows of the listings table to see how it is laid out
df.head()

In [None]:
# Structural audit of both raw tables: dimensions, missing values per column,
# and the number of fully duplicated rows.
print("df shape:", df.shape)
print("------------------------------------------------")
print("df2 shape:", df2.shape)
print("------------------------------------------------")
print("df missing values", df.isna().sum())
print("------------------------------------------------")
print("df2 missing values", df2.isna().sum())

# Check for duplicates in both tables. The last line previously printed the
# df missing-value counts a second time, so duplicates in df were never shown.
print("df2 duplicates", df2.duplicated().sum())
print("------------------------------------------------")
print("df duplicates", df.duplicated().sum())

In [None]:
# Left-join the popularity table onto the listings via the shared 'Name'
# column, so every row of df is kept whether or not it finds a match in df2.
merged_df = df.merge(df2, how='left', on='Name')

print(merged_df.shape)
merged_df.head()

In [None]:
# Count the missing values in each column of the merged table
merged_df.isna().sum()

In [None]:
# Drop null values: removes, in place, every row of merged_df that contains a
# missing value, then prints the new shape and re-counts the missing values
# per column as the cell output.

merged_df.dropna(inplace=True)
print(merged_df.shape)
merged_df.isna().sum()

In [None]:
# Pull the 'Name' column out of the merged table as a plain Python list
data = merged_df['Name'].tolist()

# Show the resulting container type
type(data)

In [None]:
# Re-count the missing values in each column of merged_df
merged_df.isna().sum()

In [None]:
# Drop rows with missing values from merged_df in place and report the result.
# NOTE(review): the same in-place dropna already appears in an earlier cell and
# nothing visible between the two adds rows to merged_df, so this looks like a
# repeat - confirm and remove if so.
merged_df.dropna(inplace=True)
print(merged_df.shape)
merged_df.isna().sum()

In [None]:

# Checking duplicates: number of rows in merged_df that exactly repeat an
# earlier row
merged_df.duplicated().sum()

In [None]:
# drop_duplicates() returns a new frame and leaves merged_df untouched, so the
# result must be assigned back for the duplicate rows to actually be removed.
merged_df = merged_df.drop_duplicates()
print(merged_df.shape)

# Guard: no fully duplicated rows may remain after the step above
assert merged_df.duplicated().sum() == 0
merged_df.head()

In [None]:
# Build the list of listing names from the 'Name' column of df.
# NOTE(review): this rebinds `data`, which an earlier cell filled from
# merged_df['Name'] - confirm that df (not merged_df) is the intended source.
data = df['Name'].tolist()

# Show the resulting container type
type(data)

In [None]:
def clean_data(data):
    """Normalise punctuation and whitespace in a sequence of strings.

    In every entry each comma, apostrophe, plus sign and hyphen is replaced by
    a space; runs of whitespace are then collapsed to a single space and the
    result is trimmed at both ends.

    Parameters
    ----------
    data : iterable of str
        Raw text entries.

    Returns
    -------
    list of str
        One cleaned string per input entry, in the original order.
    """
    def _tidy(text):
        # Punctuation becomes a space first, so squeezing whitespace afterwards
        # also removes the gaps this substitution creates.
        spaced = re.sub(r"[,'+\-]", " ", text)
        squeezed = re.sub(r"\s+", " ", spaced)
        return squeezed.strip()

    return [_tidy(entry) for entry in data]

# Clean the data: run clean_data over the `data` list and keep the result in
# `cleaned_data`.
# NOTE(review): in the cells visible here, the later steps read `data` rather
# than `cleaned_data` - confirm whether this cleaned list is meant to be used.
cleaned_data = clean_data(data)

# Display the cleaned entries as the cell output
cleaned_data

In [None]:
# Separate Samsung entries from other brands.
# The brand test ignores case so that listings written as "SAMSUNG ..." or
# "samsung ..." also land in the Samsung list; with a case-sensitive test they
# fall through to the other-brands list, whose extractor does not list Samsung
# among the brands it can name.
samsung_phones = [spec for spec in data if "samsung" in spec.lower()]
other_phones = [spec for spec in data if "samsung" not in spec.lower()]

# Let us check the result
samsung_phones, other_phones

In [None]:
# Second cleaning pass: delete arithmetic signs from each entry of a list
def clean_entries(entries):
    """Return a new list with every '+' and '-' deleted from each string.

    Parameters
    ----------
    entries : iterable of str
        Text entries to clean.

    Returns
    -------
    list of str
        The entries in their original order with the two characters removed
        (deleted outright, not replaced by a space).
    """
    sign_chars = re.compile(r'[+\-]')
    return [sign_chars.sub('', text) for text in entries]

# Clean the samsung phones list: pass it through clean_entries and rebind the
# name to the cleaned result
samsung_phones = clean_entries(samsung_phones)


# Display the cleaned Samsung phone specifications
samsung_phones

In [None]:
# Feature extraction from the samsung phones using regular expressions
def extract_samsung_info(spec):
    """Parse one Samsung listing title into a dictionary of attributes.

    Parameters
    ----------
    spec : str
        Listing title, for example
        'Samsung Galaxy A05 6.7" 4GB RAM 64GB ROM (Dual SIM) 5000mAh Black'.

    Returns
    -------
    dict
        "Name"           : "Samsung Galaxy" plus one or two model words, or
                           None when the title does not start that way.
        "Display Size"   : first number followed by a quote mark, as a string;
                           "6.0" is substituted when none is found.
        "RAM", "Storage" : integer gigabytes, or None.
        "Camera"         : first digits+"MP" token as text, or None.
        "Battery"        : first 3-5 digit "mAh" token (any letter case) as
                           text, or None.
        "Color"          : first listed colour word found, or None.
        "Warranty"       : "Yes" when the title contains "WRTY", else "No".
        "Number of SIMs" : 2 when the first SIM phrase is a dual-SIM one,
                           otherwise 1.
    """
    # Model name: at most one extra word after the first model word. The
    # negative lookahead keeps that extra word from being the display size
    # (a decimal such as 6.7, or an integer directly followed by a quote),
    # which would otherwise produce names like "Samsung Galaxy A05 6".
    name_pattern = r'^(Samsung\s+Galaxy\s+\w+(?:\s+(?!\d+\.\d|\d+\s*["\'])\w+)?)'
    display_pattern = r'(\d+(\.\d+)?)\s*["\']'
    ram_pattern = r'(\d+)GB\s*(?i:RAM)'
    storage_pattern = r'(\d+)GB\s*(?i:ROM)'
    camera_pattern = r'(\d+MP)'
    # 3-5 digits so that 10000mAh is not truncated to "0000mAh"
    battery_pattern = r'(?i)(\d{3,5}mAh)'
    # Case-insensitive: titles spell this "Dual SIM", "DUAL SIM", "Dual Sim"...
    sim_pattern = r'(?i)\(?(Dual\s*Sim|Single\s*Sim|Single)\)?'
    color_pattern = r'(?i)[-\s](Black|White|Blue|Red|Green|Yellow|Gold|Silver|Gray|Pink)(?=\s|\(|$)'

    def first_group(pattern):
        """Return group 1 of the first match of ``pattern`` in ``spec``, else None."""
        found = re.search(pattern, spec)
        return found.group(1) if found else None

    # Every "<n>GB" figure in the title, in order of appearance
    gb_values = [int(m.group(1)) for m in re.finditer(r'(\d+)GB', spec)]

    # Explicit "RAM"/"ROM" labels are authoritative
    labelled_ram = first_group(ram_pattern)
    labelled_storage = first_group(storage_pattern)
    ram = int(labelled_ram) if labelled_ram is not None else None
    storage = int(labelled_storage) if labelled_storage is not None else None

    if ram is None and storage is None:
        # No labels at all: smallest figure is taken as RAM, largest as storage
        if gb_values:
            ram, storage = min(gb_values), max(gb_values)
    else:
        # One side is labelled: fill the other only from figures the label did
        # not already claim, so a lone "64GB ROM" is not also reported as RAM.
        leftover = list(gb_values)
        for claimed in (ram, storage):
            if claimed in leftover:
                leftover.remove(claimed)
        if leftover:
            if ram is None:
                ram = min(leftover)
            if storage is None:
                storage = max(leftover)

    color = first_group(color_pattern)
    if color is not None:
        color = color.strip()

    # Fall back to a default display size when the title states none
    display_size = first_group(display_pattern)
    if display_size is None:
        display_size = "6.0"

    # Brand/model casing varies between titles, so match the name ignoring case
    name_match = re.search(name_pattern, spec, re.IGNORECASE)

    sim_kind = first_group(sim_pattern)
    is_dual = sim_kind is not None and sim_kind.lower().startswith("dual")

    return {
        "Name": name_match.group(0).strip() if name_match else None,
        "Display Size": display_size,
        "RAM": ram,
        "Storage": storage,
        "Camera": first_group(camera_pattern),
        "Battery": first_group(battery_pattern),
        "Color": color,
        "Warranty": "Yes" if "WRTY" in spec else "No",
        "Number of SIMs": 2 if is_dual else 1
    }

# Run the extractor over every cleaned Samsung listing
samsung_phones_data = list(map(extract_samsung_info, samsung_phones))

# One row per listing, one column per extracted attribute
samsung_phones_df = pd.DataFrame(samsung_phones_data)

# Show the first ten rows
print("Samsung Phones:")
samsung_phones_df.head(10)

In [None]:
# Cleaning up other phones
# clean_entries() is already defined in an earlier cell, where it is applied to
# the Samsung list; reuse it here rather than redefining an identical copy that
# silently shadows the first one.
other_phones = clean_entries(other_phones)

other_phones

In [None]:
# Feature extraction from the list of other phones
def extract_other_phones_info(spec):
    """Parse one non-Samsung listing title into a dictionary of attributes.

    Parameters
    ----------
    spec : str
        Listing title, for example
        'Tecno Spark 20 6.6" 8GB RAM 128GB ROM (Dual SIM) 5000mAh 50MP Blue'.

    Returns
    -------
    dict
        "Name"           : brand (XIAOMI, Tecno, Infinix, Itel or Oale, any
                           letter case) plus one or two model words, or None
                           when the title starts with none of those brands.
        "Display Size"   : first number followed by a quote mark, as a string;
                           "6.0" is substituted when none is found.
        "RAM", "Storage" : integer gigabytes, or None (MB figures are not
                           converted).
        "Camera"         : first digits+"MP" token as text, or None.
        "Battery"        : first 3-5 digit "mAh" token (any letter case) as
                           text, or None.
        "Color"          : first listed colour word found, or None.
        "Warranty"       : "Yes" when the title contains "WRTY", else "No".
        "Number of SIMs" : 2 when the first SIM phrase is a dual-SIM one,
                           otherwise 1.
    """
    # Model name: at most one extra word after the first model word. The
    # negative lookahead keeps that extra word from being the display size
    # (a decimal such as 6.6, or an integer directly followed by a quote).
    # This replaces stripping all trailing digits from the name, which also
    # deleted genuine model numbers ("Itel 2163" -> "Itel").
    name_pattern = r'^(XIAOMI|Tecno|Infinix|Itel|Oale)\s+(\w+(?:\s+(?!\d+\.\d|\d+\s*["\'])\w+)?)'
    display_pattern = r'(\d+(\.\d+)?)\s*["\']'
    ram_pattern = r'(\d+)GB\s*(?i:RAM)'
    storage_pattern = r'(\d+)GB\s*(?i:ROM|Storage)'
    camera_pattern = r'(\d+MP)'
    # 3-5 digits so that 10000mAh is not truncated to "0000mAh"
    battery_pattern = r'(?i)(\d{3,5}mAh)'
    # Case-insensitive: titles spell this "Dual SIM", "DUAL SIM", "Dual Sim"...
    sim_pattern = r'(?i)\(?(Dual\s*SIM|Single\s*SIM|Single)\)?'
    color_pattern = r'(?i)[-\s](Black|White|Blue|Red|Green|Yellow|Gold|Silver|Gray|Pink)(?=\s|\(|$)'

    def first_group(pattern):
        """Return group 1 of the first match of ``pattern`` in ``spec``, else None."""
        found = re.search(pattern, spec)
        return found.group(1) if found else None

    # Every "<n>GB" figure in the title, in order of appearance
    gb_values = [int(m.group(1)) for m in re.finditer(r'(\d+)GB', spec)]

    # Explicit "RAM" / "ROM" / "Storage" labels are authoritative
    labelled_ram = first_group(ram_pattern)
    labelled_storage = first_group(storage_pattern)
    ram = int(labelled_ram) if labelled_ram is not None else None
    storage = int(labelled_storage) if labelled_storage is not None else None

    if ram is None and storage is None:
        # No labels at all: smallest figure is taken as RAM, largest as storage
        if gb_values:
            ram, storage = min(gb_values), max(gb_values)
    else:
        # One side is labelled: fill the other only from figures the label did
        # not already claim, so a lone "4GB RAM" is not also reported as storage.
        leftover = list(gb_values)
        for claimed in (ram, storage):
            if claimed in leftover:
                leftover.remove(claimed)
        if leftover:
            if ram is None:
                ram = min(leftover)
            if storage is None:
                storage = max(leftover)

    color = first_group(color_pattern)
    if color is not None:
        color = color.strip()

    # Fall back to a default display size when the title states none
    display_size = first_group(display_pattern)
    if display_size is None:
        display_size = "6.0"

    # Brand casing varies between titles, so match the name ignoring case
    name_match = re.search(name_pattern, spec, re.IGNORECASE)
    name = name_match.group(0).strip() if name_match else None

    sim_kind = first_group(sim_pattern)
    is_dual = sim_kind is not None and sim_kind.lower().startswith("dual")

    return {
        "Name": name,
        "Display Size": display_size,
        "RAM": ram,
        "Storage": storage,
        "Camera": first_group(camera_pattern),
        "Battery": first_group(battery_pattern),
        "Color": color,
        "Warranty": "Yes" if "WRTY" in spec else "No",
        "Number of SIMs": 2 if is_dual else 1
    }

# Run the extractor over every cleaned non-Samsung listing
other_phones_data = list(map(extract_other_phones_info, other_phones))

# One row per listing, one column per extracted attribute
other_phones_df = pd.DataFrame(other_phones_data)

# Show the first twenty rows
print("Other Phones:")
other_phones_df.head(20)

In [None]:
# Stack the Samsung and other-brand tables into one frame, sorted by name.
# ignore_index=True gives the combined rows a fresh 0..n-1 index before the
# sort; without it both inputs contribute their own 0-based labels, so index
# values repeat and label-based lookups can return more than one row.
smartphones_df = pd.concat(
    [samsung_phones_df, other_phones_df],
    ignore_index=True,
).sort_values("Name")

# Display the merged DataFrame
smartphones_df.head()

In [None]:
# Let us check the name column: how many rows carry each extracted name
smartphones_df["Name"].value_counts()