In [None]:
## Samsung Data Cleaning


In [None]:
import numpy as np
import pandas as pd

# Read the raw Samsung listings; the relative path is resolved against the working directory
DATA_FILE = 'samsung_mobile_new_data.csv'
df = pd.read_csv(DATA_FILE)

# Peek at the top rows to confirm the load worked
df.head()

In [None]:
# Count missing entries per column
df.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted)
df.duplicated(keep='first').sum()

In [None]:
# Remove repeated rows and renumber the index from 0 in a single call
df = df.drop_duplicates(ignore_index=True)

# Preview the de-duplicated frame
df.head()

In [None]:
# Identify categorical columns (those stored with the generic object dtype)
categorical_cols = df.select_dtypes(include=['object']).columns

# Print unique values in each categorical column so inconsistent spellings are easy to spot.
# The print call previously lacked its closing parenthesis, which made the cell a SyntaxError.
for col in categorical_cols:
    print(f'\nUnique values in {col}:\n{df[col].unique()}')

In [None]:
# Columns held as the generic object dtype are treated as categorical/text here
categorical_cols = df.select_dtypes(include='object').columns

# Dump the distinct values of every such column for a quick visual audit
for col in categorical_cols:
    print(f'\nUnique values in {col}:\n{df[col].unique()}')

In [None]:
# Brand is the first space-separated token of the listing name.
# The .str accessor propagates a missing name as NaN instead of raising on it.
df['Brand'] = df['name'].str.split(' ').str[0]

# Extract model, color, storage, and network from 'name' column
# Model: everything before the first ' ('
df['Model'] = df['name'].str.extract(r'(.+?) \(', expand=False)
# NOTE(review): this pattern only matches when the text after the first '(' is itself
# followed by ' (' — confirm against the real 'name' format, otherwise Color stays NaN.
df['Color'] = df['name'].str.extract(r'\((.+?) \(', expand=False)
# Accept both '64GB' and '64 GB'; the 'storage_ram' patterns below expect the spaced form.
df['Storage'] = df['name'].str.extract(r'(\d+\s?GB)', expand=False)
# A standalone '5G' token marks a 5G listing (the former look-ahead was redundant).
df['Network'] = df['name'].str.extract(r'(\b5G\b)', expand=False)

# Extract features from 'storage_ram' column; rows that do not match a pattern get NaN
df[['Internal_Storage', 'RAM']] = df['storage_ram'].str.extract(r'Internal Storage(\d+ GB)RAM(\d+ GB)')
df['Expandable_Storage'] = df['storage_ram'].str.extract(r'Expandable Storage(\d+ TB)', expand=False)
df['Supported_Memory_Card_Type'] = df['storage_ram'].str.extract(r'Supported Memory Card Type(MIcroSD)', expand=False)
df['Memory_Card_Slot_Type'] = df['storage_ram'].str.extract(r'Memory Card Slot Type(Dedicated Slot)', expand=False)
df['Call_Log_Memory'] = df['storage_ram'].str.extract(r'Call Log Memory(Yes)', expand=False)

# Display the first few rows of the dataframe
df.head()

In [None]:
# Strip the literal word 'Memory' from both memory-card columns
for memory_col in ('Supported_Memory_Card_Type', 'Memory_Card_Slot_Type'):
    df[memory_col] = df[memory_col].str.replace('Memory', '', regex=False)

# Simplify 'Memory_Card_Slot_Type' column
def simplify_memory_card_slot(value):
    """Map a raw memory-card-slot description to a canonical label.

    Parameters
    ----------
    value : object
        A single cell value; expected to be a string, but may be missing
        (NaN/None) or of another type.

    Returns
    -------
    str or float
        'Hybrid Slot' if the text contains 'Hybrid', 'Dedicated Slot' if it
        contains 'Dedicated' ('Hybrid' wins when both appear), otherwise NaN.
    """
    # Missing values and stray non-text values carry no slot information;
    # checking the type also avoids a TypeError from the `in` tests below.
    if not isinstance(value, str):
        return np.nan
    if 'Hybrid' in value:
        return 'Hybrid Slot'
    if 'Dedicated' in value:
        return 'Dedicated Slot'
    return np.nan

# Collapse each raw slot description into its canonical label
slot_simplified = df['Memory_Card_Slot_Type'].apply(simplify_memory_card_slot)
df['Memory_Card_Slot_Simplified'] = slot_simplified

# The raw column is now redundant, so remove it from the frame in place
del df['Memory_Card_Slot_Type']

In [None]:
# Extract features from 'os_processor' column.
# str.extract yields NaN for rows where the source is missing or the pattern does not match.
df['Operating_System'] = df['os_processor'].str.extract(r'(Operating System\w+)', expand=False)
df['OS_Version'] = df['os_processor'].str.extract(r'(\d+)', expand=False)  # first run of digits in the text
df['Chipset'] = df['os_processor'].str.extract(r'(Chipset\w+)', expand=False)
df['CPU'] = df['os_processor'].str.extract(r'(CPU\w+)', expand=False)

# Extract features from 'camera' column as 'Yes'/'No' flags.
# A plain `marker in x` test raises TypeError on missing (NaN) entries, so use the
# NaN-aware string accessor and count a missing description as 'No'.
camera_markers = {
    'Primary_Camera': 'Primary Camera AvailableYes',
    'Secondary_Camera': 'Secondary Camera AvailableYes',
    'Dual_Camera': 'Dual Camera Setup',
    'Video_4K': '4K',
}
for flag_col, marker in camera_markers.items():
    has_marker = df['camera'].str.contains(marker, regex=False, na=False)
    df[flag_col] = has_marker.map({True: 'Yes', False: 'No'})

In [None]:
# Fill missing values with the literal string 'None'.
# Assign the result back: a chained `df[col].fillna(..., inplace=True)` acts on a temporary
# object, is deprecated, and leaves the frame untouched under pandas Copy-on-Write.
df['os_processor'] = df['os_processor'].fillna('None')
df['camera'] = df['camera'].fillna('None')

# Extract features from 'os_processor' column (NaN where the pattern does not match)
df['Operating_System'] = df['os_processor'].str.extract(r'(Operating System\w+)', expand=False)
df['OS_Version'] = df['os_processor'].str.extract(r'(\d+)', expand=False)  # first run of digits in the text
df['Chipset'] = df['os_processor'].str.extract(r'(Chipset\w+)', expand=False)
df['CPU'] = df['os_processor'].str.extract(r'(CPU\w+)', expand=False)

# Extract features from 'camera' column: 'Yes' when the marker text occurs, else 'No'
camera_markers = {
    'Primary_Camera': 'Primary Camera AvailableYes',
    'Secondary_Camera': 'Secondary Camera AvailableYes',
    'Dual_Camera': 'Dual Camera Setup',
    'Video_4K': '4K',
}
for flag_col, marker in camera_markers.items():
    has_marker = df['camera'].str.contains(marker, regex=False, na=False)
    df[flag_col] = has_marker.map({True: 'Yes', False: 'No'})

In [None]:
# Fill missing values with the literal string 'None'.
# Assign the result back: a chained `df[col].fillna(..., inplace=True)` acts on a temporary
# object, is deprecated, and leaves the frame untouched under pandas Copy-on-Write.
df['display'] = df['display'].fillna('None')

# Extract features from 'display' column
df['Display_Size'] = df['display'].str.extract(r'(\d+\.\d+ cm)', expand=False)

# Panel-type flags: 'Yes' when the pattern occurs in the description, else 'No'.
# 'LED' must not be preceded by an uppercase letter; a bare substring test would
# also fire on every 'AMOLED'/'OLED' panel and make the LED flag meaningless.
display_type_patterns = {
    'Display_Type_LED': r'(?<![A-Z])LED',
    'Display_Type_LCD': r'LCD',
    'Display_Type_AMOLED': r'AMOLED',
}
for flag_col, pattern in display_type_patterns.items():
    is_type = df['display'].str.contains(pattern, regex=True, na=False)
    df[flag_col] = is_type.map({True: 'Yes', False: 'No'})

df['Resolution'] = df['display'].str.extract(r'(\d+x\d+ Pixels)', expand=False)

In [None]:
# Fill missing values with the literal string 'None'.
# Assign the result back: a chained `df[col].fillna(..., inplace=True)` acts on a temporary
# object, is deprecated, and leaves the frame untouched under pandas Copy-on-Write.
df['network'] = df['network'].fillna('None')

# Extract features from 'network' column: 'Yes' when the marker text occurs, else 'No'
network_markers = {
    'Network_Type_2G': '2G',
    'Network_Type_3G': '3G',
    'Network_Type_4G': '4G',
    'Network_Type_5G': '5G',
    'Supported_Networks_GSM': 'GSM',
    'Supported_Networks_LTE': 'LTE',
    'Internet_Connectivity_WiFi': 'Wi-Fi',
    'Internet_Connectivity_GPRS': 'GPRS',
    'Bluetooth_Support': 'Bluetooth SupportYes',
}
for flag_col, marker in network_markers.items():
    has_marker = df['network'].str.contains(marker, regex=False, na=False)
    df[flag_col] = has_marker.map({True: 'Yes', False: 'No'})

In [None]:
# Fill missing values with the literal string 'None'.
# Assign the result back: a chained `df[col].fillna(..., inplace=True)` acts on a temporary
# object, is deprecated, and leaves the frame untouched under pandas Copy-on-Write.
df['battery'] = df['battery'].fillna('None')

# Extract features from 'battery' column
df['Battery_Capacity'] = df['battery'].str.extract(r'(\d+ mAh)', expand=False)

# 'Yes'/'No' flags: 'Yes' when the marker text occurs in the battery description
battery_markers = {
    'Battery_Type_Li_Ion': 'Li-Ion',
    'Battery_Type_Li_Poly': 'Li-Poly',
    'Wireless_Charging': 'Wireless Charging',
    'Fast_Charging': 'Fast Charging',
}
for flag_col, marker in battery_markers.items():
    has_marker = df['battery'].str.contains(marker, regex=False, na=False)
    df[flag_col] = has_marker.map({True: 'Yes', False: 'No'})

In [None]:
# Clean 'Model' column: drop the literal brand name, then trim the space it leaves
# behind (e.g. 'SAMSUNG Galaxy F13' would otherwise become ' Galaxy F13').
df['Model'] = df['Model'].str.replace('SAMSUNG', '', regex=False).str.strip()

In [None]:
# Frequency of each non-missing value in 'Network' (missing values are excluded by default)
df['Network'].value_counts()

In [None]:
# Clean 'Model' column: remove the '5G' marker, then collapse the doubled or
# trailing spaces that the removal leaves behind.
df['Model'] = (
    df['Model']
    .str.replace('5G', '', regex=False)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [None]:
# Tally how many times '5G' still appears across all 'Model' values
per_row_5g = df['Model'].str.count('5G')
total_5g_occurrences = per_row_5g.sum()
total_5g_occurrences