### Data Preprocessing

In [1]:
import pandas as pd
import ast
import re
import numpy as np

In [2]:
# First, let's check the file encoding
import pandas as pd

# Try reading with different encodings if needed
try:
    df = pd.read_csv(
        "C:/Users/mahmu/Downloads/DataAnalyticsProjects/product-recommender-ai/data/mobiledokan_products.csv",
        quotechar='"',
        quoting=1,  # QUOTE_MINIMAL
        on_bad_lines='warn',  # This replaces error_bad_lines in newer versions
        encoding='utf-8'
    )
    print("Successfully read the file with UTF-8 encoding")
except UnicodeDecodeError:
    try:
        df = pd.read_csv(
            "C:/Users/mahmu/Downloads/DataAnalyticsProjects/product-recommender-ai/data/mobiledokan_products.csv",
            quotechar='"',
            quoting=1,
            on_bad_lines='warn',
            encoding='latin1'  # Try latin1 if UTF-8 fails
        )
        print("Successfully read the file with latin1 encoding")
    except Exception as e:
        print(f"Error reading file: {e}")

# Display the first few rows if successful
if 'df' in locals():
    display(df.head())

Successfully read the file with UTF-8 encoding


Unnamed: 0,name,brand,model,price,url,img_url,display_type,screen_size_inches,display_resolution,pixel_density_ppi,...,face_unlock,light_sensor,infrared,fm_radio,operating_system,os_version,user_interface,status,made_by,release_date
0,Realme Neo7 Turbo,Realme,Neo7 Turbo,"?.35,000(Expected)",https://www.mobiledokan.com/mobile/realme-neo7...,https://www.mobiledokan.com/media/realme-neo7-...,AMOLED,6.8 inches (17.27 cm),1280x2800 px (FHD+),453 ppi,...,Yes,"Light sensor, Proximity sensor, Accelerometer,...",Yes,,Android,v15,,Upcoming,China,Exp. 29 May 2025
1,Tecno Spark 40,Tecno,Spark 40,"?.20,000(Expected)",https://www.mobiledokan.com/mobile/tecno-spark-40,https://www.mobiledokan.com/media/tecno-spark-...,IPS LCD,6.78 inches (17.22 cm),1080x2520 px (FHD+),,...,Yes,"Light sensor, Proximity sensor, Accelerometer,...",,,Android,v15,HIOS 15,Rumored,China,Not announced yet
2,Vivo S30,Vivo,S30,"?.50,000(Expected)",https://www.mobiledokan.com/mobile/vivo-s30,https://www.mobiledokan.com/media/vivo-s30-blu...,AMOLED,6.67 inches (16.94 cm),1260x2800 px (FHD+),460 ppi,...,Yes,"Light sensor, Proximity sensor, Accelerometer,...",Yes,,Android,v15,OriginOS 5,Upcoming,China,Exp. 29 May 2025
3,OnePlus Ace 5 Ultra,OnePlus,Ace 5 Ultra,"?.40,000(Expected)",https://www.mobiledokan.com/mobile/oneplus-ace...,https://www.mobiledokan.com/media/oneplus-ace-...,LTPO AMOLED,6.83 inches (17.35 cm),1272x2800 px (FHD+),450 ppi,...,Yes,"Light sensor, Proximity sensor, Accelerometer,...",Yes,,Android,v15,ColorOS 15,Rumored,China,Not announced yet
4,Motorola Moto G86 Power,Motorola,Moto G86 Power,"?.50,000(Expected)",https://www.mobiledokan.com/mobile/motorola-mo...,https://www.mobiledokan.com/media/motorola-mot...,P-OLED,6.67 inches (16.94 cm),1220x2712 px (FHD+),446 ppi,...,Yes,"Light sensor, Proximity sensor, Accelerometer,...",,,Android,v15,,Upcoming,USA,Exp. 10 June 2025


### Data Cleaning

In [3]:
# Clean price column by removing "?." prefix
df['price'] = df['price'].str.replace('?.', '', regex=False)

In [4]:
# Create price_original column with numeric values
df['price_original'] = df['price'].str.replace(',', '').str.extract('(\d+)').astype(float)

In [5]:
df[['price','price_original']].head(10)

Unnamed: 0,price,price_original
0,"35,000(Expected)",35000.0
1,"20,000(Expected)",20000.0
2,"50,000(Expected)",50000.0
3,"40,000(Expected)",40000.0
4,"50,000(Expected)",50000.0
5,"7,999(Official)",7999.0
6,"45,000(Expected)",45000.0
7,"15,000(Expected)",15000.0
8,"60,000(Expected)",60000.0
9,"130,000(Unofficial)",130000.0


##### Price-based Features

In [6]:
# Price categories
df['price_category'] = pd.cut(df['price_original'], 
    bins=[0, 20000, 40000, 60000, 100000, float('inf')],
    labels=['Budget', 'Mid-range', 'Upper Mid-range', 'Premium', 'Flagship'])

df[["name", "price", "price_category"]].head(10)

Unnamed: 0,name,price,price_category
0,Realme Neo7 Turbo,"35,000(Expected)",Mid-range
1,Tecno Spark 40,"20,000(Expected)",Budget
2,Vivo S30,"50,000(Expected)",Upper Mid-range
3,OnePlus Ace 5 Ultra,"40,000(Expected)",Mid-range
4,Motorola Moto G86 Power,"50,000(Expected)",Upper Mid-range
5,Walton Orbit Y13,"7,999(Official)",Budget
6,Motorola Moto G86,"45,000(Expected)",Upper Mid-range
7,Acer Super ZX,"15,000(Expected)",Budget
8,Realme GT 7,"60,000(Expected)",Upper Mid-range
9,Samsung Galaxy S25 Edge,"130,000(Unofficial)",Flagship


In [7]:
# Function to convert storage to GB
def convert_to_gb(value):
    if pd.isna(value):
        return None
    value = str(value).lower()
    if 'gb' in value:
        return float(value.replace('gb', '').strip())
    elif 'mb' in value:
        return float(value.replace('mb', '').strip()) / 1024
    else:
        return None

# Function to convert RAM to GB
def convert_ram_to_gb(value):
    if pd.isna(value):
        return None
    value = str(value).lower()
    # Extract numbers from the string
    import re
    numbers = re.findall(r'\d+', value)
    if numbers:
        num = float(numbers[0])
        if 'gb' in value:
            return num
        elif 'mb' in value:
            return num / 1024
        else:
            # If no unit specified, assume GB
            return num
    return None

# Convert storage and RAM to GB
df['storage_gb'] = df['internal_storage'].apply(convert_to_gb)
df['ram_gb'] = df['ram'].apply(convert_ram_to_gb)

# Calculate price per GB and round to 2 decimal places
df['price_per_gb'] = (df['price_original'] / df['storage_gb']).round(2)
df['price_per_gb_ram'] = (df['price_original'] / df['ram_gb']).round(2)

# Display results
df[["name", "price", "internal_storage", "ram", 
          "storage_gb", "ram_gb", "price_per_gb", "price_per_gb_ram"]].head(10)

Unnamed: 0,name,price,internal_storage,ram,storage_gb,ram_gb,price_per_gb,price_per_gb_ram
0,Realme Neo7 Turbo,"35,000(Expected)",256 GB,12 GB,256.0,12.0,136.72,2916.67
1,Tecno Spark 40,"20,000(Expected)",128 GB,8 GB,128.0,8.0,156.25,2500.0
2,Vivo S30,"50,000(Expected)",256 GB,12 GB,256.0,12.0,195.31,4166.67
3,OnePlus Ace 5 Ultra,"40,000(Expected)",256 GB,12 GB,256.0,12.0,156.25,3333.33
4,Motorola Moto G86 Power,"50,000(Expected)",512 GB,8 GB,512.0,8.0,97.66,6250.0
5,Walton Orbit Y13,"7,999(Official)",64 GB,4 GB,64.0,4.0,124.98,1999.75
6,Motorola Moto G86,"45,000(Expected)",256 GB,8 GB,256.0,8.0,175.78,5625.0
7,Acer Super ZX,"15,000(Expected)",128 GB,4 GB,128.0,4.0,117.19,3750.0
8,Realme GT 7,"60,000(Expected)",256 GB,8 GB,256.0,8.0,234.38,7500.0
9,Samsung Galaxy S25 Edge,"130,000(Unofficial)",256 GB,12 GB,256.0,12.0,507.81,10833.33


##### Display-based Features

In [8]:
# Screen size in numeric format
df['screen_size_numeric'] = df['screen_size_inches'].str.extract('(\d+\.?\d*)').astype(float)

# Resolution in numeric format (width x height)
df['resolution_width'] = df['display_resolution'].str.extract('(\d+)x').astype(float)
df['resolution_height'] = df['display_resolution'].str.extract('x(\d+)').astype(float)

# Pixels per inch (PPI) in numeric format
df['ppi_numeric'] = df['pixel_density_ppi'].str.extract('(\d+)').astype(float)

# Refresh rate in numeric format
df['refresh_rate_numeric'] = df['refresh_rate_hz'].str.extract('(\d+)').astype(float)

df[["name", "screen_size_inches", "display_resolution", "pixel_density_ppi", "refresh_rate_hz", 
  'screen_size_numeric', 'resolution_width', 'resolution_height', 'ppi_numeric', 'refresh_rate_numeric']].head()


Unnamed: 0,name,screen_size_inches,display_resolution,pixel_density_ppi,refresh_rate_hz,screen_size_numeric,resolution_width,resolution_height,ppi_numeric,refresh_rate_numeric
0,Realme Neo7 Turbo,6.8 inches (17.27 cm),1280x2800 px (FHD+),453 ppi,120 Hz,6.8,1280.0,2800.0,453.0,120.0
1,Tecno Spark 40,6.78 inches (17.22 cm),1080x2520 px (FHD+),,120 Hz,6.78,1080.0,2520.0,,120.0
2,Vivo S30,6.67 inches (16.94 cm),1260x2800 px (FHD+),460 ppi,120 Hz,6.67,1260.0,2800.0,460.0,120.0
3,OnePlus Ace 5 Ultra,6.83 inches (17.35 cm),1272x2800 px (FHD+),450 ppi,120 Hz,6.83,1272.0,2800.0,450.0,120.0
4,Motorola Moto G86 Power,6.67 inches (16.94 cm),1220x2712 px (FHD+),446 ppi,120 Hz,6.67,1220.0,2712.0,446.0,120.0


In [9]:
def get_display_score(phone):
    score = 0
    weights = {
        'resolution': 0.4,
        'ppi': 0.3,
        'refresh_rate': 0.3
    }
    
    # Higher resolution is better
    if not pd.isna(phone['resolution_width']) and not pd.isna(phone['resolution_height']):
        resolution_score = (phone['resolution_width'] * phone['resolution_height']) / 1000000
        score += resolution_score * weights['resolution']
    
    # Higher PPI is better
    if not pd.isna(phone['ppi_numeric']):
        ppi_score = phone['ppi_numeric'] / 100
        score += ppi_score * weights['ppi']
    
    # Higher refresh rate is better
    if not pd.isna(phone['refresh_rate_numeric']):
        refresh_score = phone['refresh_rate_numeric'] / 60
        score += refresh_score * weights['refresh_rate']
    
    # Normalize score based on available features
    available_features = sum(1 for x in [phone['ppi_numeric'], 
                                       phone['refresh_rate_numeric'],
                                       phone['resolution_width']] if not pd.isna(x))
    if available_features > 0:
        score = score / available_features
    
    return round(score, 4)

# Add display score to dataframe
df['display_score'] = df.apply(get_display_score, axis=1)

# Display results
df[["name", "display_score"]].head()

Unnamed: 0,name,display_score
0,Realme Neo7 Turbo,1.1309
1,Tecno Spark 40,0.8443
2,Vivo S30,1.1304
3,OnePlus Ace 5 Ultra,1.1249
4,Motorola Moto G86 Power,1.0872


##### Camera-based Features

In [10]:
# Function to convert camera setup to count
def get_camera_count(value):
    if pd.isna(value):
        return 0
    value = str(value).lower()
    if value == 'single':
        return 1
    elif value == 'dual':
        return 2
    elif value == 'triple':
        return 3
    elif value == 'quad':
        return 4
    else:
        return 0

# Number of cameras
df['camera_count'] = df['camera_setup'].apply(get_camera_count)

# Display results
print(df[["name", "camera_setup", "camera_count"]].head())

                      name camera_setup  camera_count
0        Realme Neo7 Turbo         Dual             2
1           Tecno Spark 40       Single             1
2                 Vivo S30       Triple             3
3      OnePlus Ace 5 Ultra         Dual             2
4  Motorola Moto G86 Power         Dual             2


In [11]:

def extract_primary_camera_mp(value):
    if pd.isna(value):
        return None
    
    value_str = str(value).strip()
    if not value_str:
        return None

    # Try to extract explicit MP values first
    mp_match = re.search(r'(\d+\.?\d*)\s*MP', value_str, re.IGNORECASE)
    if mp_match:
        return float(mp_match.group(1))

    # If no explicit MP, try to extract pixel dimensions and convert to MP
    pixel_match = re.search(r'(\d+)\s*x\s*(\d+)\s*Pixels', value_str, re.IGNORECASE)
    if pixel_match:
        width = int(pixel_match.group(1))
        height = int(pixel_match.group(2))
        # Convert pixels to megapixels (round to 2 decimal places)
        return round((width * height) / 1_000_000, 0)

    # Return None if neither format matches
    return None

# Apply the function to create the 'primary_camera_mp' column
df['primary_camera_mp'] = df['primary_camera_resolution'].apply(extract_primary_camera_mp)

# Display results (showing original and new column)
print(df[["primary_camera_resolution", "primary_camera_mp"]].tail(15))

                              primary_camera_resolution  primary_camera_mp
5075                                 8150 x 6150 Pixels               50.0
5076  50 MP, f/1.8, Wide Angle Camera, 8 MP, f/2.4, ...               50.0
5077                                 8150 x 6150 Pixels               50.0
5078                                 8150 x 6150 Pixels               50.0
5079                                 8150 x 6150 Pixels               50.0
5080                                 8150 x 6150 Pixels               50.0
5081                                 8150 x 6150 Pixels               50.0
5082                                 8150 x 6150 Pixels               50.0
5083                                 4128 x 3096 Pixels               13.0
5084                                 8150 x 6150 Pixels               50.0
5085                                 8150 x 6150 Pixels               50.0
5086                               16000 x 12500 Pixels              200.0
5087                     

In [12]:
def extract_selfie_camera_mp(value):
    if pd.isna(value):
        return None
    
    value_str = str(value).strip()
    if not value_str:
        return None

    # Use regex to find a number (integer or float) followed by 'MP'
    mp_match = re.search(r'(\d+\.?\d*)\s*MP', value_str, re.IGNORECASE)
    if mp_match:
        return float(mp_match.group(1))
    
    # Return None if no MP value is found
    return None

# Apply the function to create the 'selfie_camera_mp' column
df['selfie_camera_mp'] = df['selfie_camera_resolution'].apply(extract_selfie_camera_mp)

# Display results (showing original and new column)
print(df[["selfie_camera_resolution", "selfie_camera_mp"]].head(10))

                   selfie_camera_resolution  selfie_camera_mp
0  16 MP, f/2.4, Wide Angle, Primary Camera              16.0
1                     13 MP, Primary Camera              13.0
2  50 MP, f/2.0, Wide Angle, Primary Camera              50.0
3  16 MP, f/2.4, Wide Angle, Primary Camera              16.0
4  32 MP, f/2.2, Wide Angle, Primary Camera              32.0
5               5 MP, f/2.0, Primary Camera               5.0
6  32 MP, f/2.2, Wide Angle, Primary Camera              32.0
7                     13 MP, Primary Camera              13.0
8  32 MP, f/2.5, Wide Angle, Primary Camera              32.0
9  12 MP, f/2.2, Wide Angle, Primary Camera              12.0


In [13]:
import pandas as pd
import numpy as np # Import numpy for nan handling

def get_camera_score(phone):
    score = 0
    weights = {
        'camera_count': 0.2,
        'primary_camera_mp': 0.5,
        'selfie_camera_mp': 0.3
    }
    
    # Add score based on available features, applying weights
    if not pd.isna(phone['camera_count']) and phone['camera_count'] > 0:
         # Simple scaling for camera count (e.g., max 4 cameras -> score / 4)
        score += (phone['camera_count'] / 4) * weights['camera_count'] # Assuming max ~4 is high-end

    if not pd.isna(phone['primary_camera_mp']):
        # Simple scaling for MP (e.g., divide by a high-end value like 108 or 200)
        score += (phone['primary_camera_mp'] / 200) * weights['primary_camera_mp'] # Assuming max ~200MP

    if not pd.isna(phone['selfie_camera_mp']):
        # Simple scaling for MP
        score += (phone['selfie_camera_mp'] / 64) * weights['selfie_camera_mp'] # Assuming max ~64MP


    return round(score, 4) # Round to a few decimal places

# Add camera score to dataframe
df['camera_score'] = df.apply(get_camera_score, axis=1)

# Display results
df[["name", "camera_setup", "primary_camera_resolution", "selfie_camera_resolution", 
          "camera_count", "primary_camera_mp", "selfie_camera_mp", "camera_score"]].head()

Unnamed: 0,name,camera_setup,primary_camera_resolution,selfie_camera_resolution,camera_count,primary_camera_mp,selfie_camera_mp,camera_score
0,Realme Neo7 Turbo,Dual,8150 x 6150 Pixels,"16 MP, f/2.4, Wide Angle, Primary Camera",2,50.0,16.0,0.3
1,Tecno Spark 40,Single,9000 x 7000 Pixels,"13 MP, Primary Camera",1,63.0,13.0,0.2684
2,Vivo S30,Triple,8150 x 6150 Pixels,"50 MP, f/2.0, Wide Angle, Primary Camera",3,50.0,50.0,0.5094
3,OnePlus Ace 5 Ultra,Dual,8150 x 6150 Pixels,"16 MP, f/2.4, Wide Angle, Primary Camera",2,50.0,16.0,0.3
4,Motorola Moto G86 Power,Dual,8150 x 6150 Pixels,"32 MP, f/2.2, Wide Angle, Primary Camera",2,50.0,32.0,0.375


##### Battery-based Features

In [14]:
# Battery capacity in numeric format
df['battery_capacity_numeric'] = df['capacity'].str.extract('(\d+)').astype(float)

# Has fast charging (boolean)
df['has_fast_charging'] = df['quick_charging'].notna()

# Has wireless charging (boolean)
df['has_wireless_charging'] = df['wireless_charging'].notna()

In [15]:
# Function to extract wattage from quick_charging string
def extract_wattage(value):
    if pd.isna(value):
        return None
    
    value_str = str(value).strip()
    if not value_str:
        return None
    
    # Use regex to find a number (integer or float) followed by 'W'
    wattage_match = re.search(r'(\d+\.?\d*)\s*W', value_str, re.IGNORECASE)
    if wattage_match:
        return float(wattage_match.group(1))
    
    return None

# Apply the function to create the 'charging_wattage' column
df['charging_wattage'] = df['quick_charging'].apply(extract_wattage)


In [16]:
# Calculate the 99th percentile for battery capacity, ignoring NaNs
# Use .quantile(0.99) or .quantile(0.95) depending on how many outliers you want to ignore
battery_capacity_99_percentile = df['battery_capacity_numeric'].quantile(0.99)

# Calculate the max wattage (percentile might be less necessary here unless there are extreme wattage outliers)
# Let's stick with max wattage for now, or you could use percentile here too if needed.
max_wattage = df['charging_wattage'].max()

# Define the battery scoring function using the percentile for capacity scaling
def get_battery_score_percentile(phone, capacity_percentile, max_wattage_val):
    score = 0
    weights = {
        'capacity': 0.5,
        'charging_wattage': 0.4,
        'wireless_charging': 0.1
    }

    # Scale by the calculated percentile for capacity
    # Ensure percentile > 0 to avoid division by zero
    if not pd.isna(phone['battery_capacity_numeric']) and capacity_percentile > 0:
        # Cap the value at the percentile to avoid scores > weight for extreme outliers
        scaled_capacity = min(phone['battery_capacity_numeric'], capacity_percentile) / capacity_percentile
        score += scaled_capacity * weights['capacity']

    # Scale by the max wattage (ensure max_wattage_val > 0)
    if not pd.isna(phone['charging_wattage']) and max_wattage_val > 0:
        # Cap the value at the max wattage to avoid scores > weight for extreme outliers
        scaled_wattage = min(phone['charging_wattage'], max_wattage_val) / max_wattage_val
        score += scaled_wattage * weights['charging_wattage']


    # Add a bonus if it has wireless charging
    # Ensure has_wireless_charging is treated as boolean or handle differently
    if 'has_wireless_charging' in phone and not pd.isna(phone['has_wireless_charging']) and phone['has_wireless_charging']:
        score += 1 * weights['wireless_charging']

    return round(score, 4)

# Apply the function, passing the pre-calculated percentile and max wattage
df['battery_score'] = df.apply(
    lambda row: get_battery_score_percentile(row, battery_capacity_99_percentile, max_wattage),
    axis=1
)

# Display results (showing relevant columns and the new score)
df[["name", "capacity", "quick_charging", "wireless_charging",
          "battery_capacity_numeric", "charging_wattage", "has_wireless_charging", "battery_score"]].head()

# Optional: Display some phones with high battery capacity to check the scores
print("\nPhones with high battery capacity:")
print(df.nlargest(10, 'battery_capacity_numeric')[["name", "capacity", "battery_capacity_numeric", "battery_score"]])


Phones with high battery capacity:
                          name   capacity  battery_capacity_numeric  \
5086   Nokia Dragon Pro (2024)  71000 mAh                   71000.0   
480   Energizer Hard Case P28K  28000 mAh                   28000.0   
169       Ulefone Armor 34 Pro  25500 mAh                   25500.0   
177       Ulefone Armor 33 Pro  25500 mAh                   25500.0   
486           Doogee V Max Pro  22000 mAh                   22000.0   
654          Doogee V Max Plus  22000 mAh                   22000.0   
1060          Oukitel WP33 Pro  22000 mAh                   22000.0   
166          Doogee Fire 6 Max  20800 mAh                   20800.0   
359        Doogee Fire 6 Power  15500 mAh                   15500.0   
294              Oscal Pilot 1  15000 mAh                   15000.0   

      battery_score  
5086          0.555  
480           0.555  
169           0.610  
177           0.610  
486           0.555  
654           0.555  
1060          0.555  
166   

##### Performance-based Features

In [22]:
import pandas as pd
import re
import numpy as np

# --- Step 1: Load Processor Rankings from CSV ---
processor_rankings = {}
try:
    # Read the CSV file
    processor_df = pd.read_csv('processor_ratings.csv')
    
    # Create a dictionary mapping processor information
    processor_rankings = {}
    for _, row in processor_df.iterrows():
        # For processors with "Other" company, only use processor name
        if row['Company'].lower() == 'other':
            key = row['Processor'].lower().strip()
        else:
            # For other companies, use company + processor
            key = f"{row['Company']} {row['Processor']}".lower().strip()
            
        # Store rank, GPU, and company info
        processor_rankings[key] = {
            'rank': row['Rank'],
            'gpu': row['GPU'].lower().strip() if pd.notna(row['GPU']) else None,
            'company': row['Company'].lower().strip(),
            'processor_name': row['Processor'].lower().strip() 
        }
    
    print(f"Successfully loaded {len(processor_rankings)} processor rankings from processor_ratings.csv")
except FileNotFoundError:
    print("Error: processor_ratings.csv not found. Processor rankings will not be used.")
except Exception as e:
    print(f"Error loading processor_ratings.csv: {str(e)}")

def get_processor_rank(processor_name, gpu_name, rankings_dict):
    """
    Get processor rank from the pre-loaded rankings dictionary.
    Returns rank (1-208) or a default value if not found.
    """
    if pd.isna(processor_name) or not isinstance(processor_name, str):
        print(f"Invalid processor name: {processor_name} - Using default rank: {len(rankings_dict) + 1}")
        return len(rankings_dict) + 1

    # Clean the processor name for lookup
    cleaned_name = processor_name.lower().strip()
    
    # First try exact match
    if cleaned_name in rankings_dict:
        processor_info = rankings_dict[cleaned_name]
        print(f"Found exact match: {processor_name} - Rank: {processor_info['rank']}")
        return processor_info['rank']
    
    # If no exact match, try partial matching
    for key, info in rankings_dict.items():
        # For "Other" company processors, only match processor name
        if info['company'] == 'other':
            if info['processor_name'] in cleaned_name or cleaned_name in info['processor_name']:
                print(f"Found partial match (Other): {processor_name} -> {key} - Rank: {info['rank']}")
                return info['rank']
        else:
            # For other companies, match full name
            if cleaned_name in key or key in cleaned_name:
                print(f"Found partial match: {processor_name} -> {key} - Rank: {info['rank']}")
                return info['rank']
    
    # If no match found, try to find similar processors
    similar_processors = []
    
    # First check if it's an "Other" company processor
    other_series_patterns = ['a', 'apple', 'helio', 'dimensity', 'tensor', 'exynos', 'kirin', 'unisoc', 'tiger', 't']
    for pattern in other_series_patterns:
        if pattern in cleaned_name:
            # Find all processors in the "Other" category with this pattern
            for key, info in rankings_dict.items():
                if info['company'] == 'other' and pattern in info['processor_name']:
                    similar_processors.append(info['rank'])
            if similar_processors:
                median_rank = int(np.median(similar_processors))
                print(f"Found similar 'Other' processor: {processor_name} - Using median rank: {median_rank}")
                return median_rank
    
    # If not an "Other" processor, try regular company matching
    company = None
    series = None
    
    # Try to identify company
    for comp in ['mediatek', 'qualcomm', 'apple', 'samsung', 'google', 'intel', 'amd']:
        if comp in cleaned_name:
            company = comp
            break
    
    # Try to identify series
    series_patterns = {
        'mediatek': ['helio', 'dimensity', 'g', 'p'],
        'qualcomm': ['snapdragon', 'gen', 's'],
        'apple': ['a', 'm'],
        'samsung': ['exynos'],
        'google': ['tensor'],
        'intel': ['core', 'celeron', 'pentium'],
        'amd': ['ryzen']
    }
    
    if company and company in series_patterns:
        for pattern in series_patterns[company]:
            if pattern in cleaned_name:
                series = pattern
                break
    
    # Find similar processors based on company and series
    if company and series:
        for key, info in rankings_dict.items():
            if company in key.lower() and series in key.lower():
                similar_processors.append(info['rank'])
    
    # If we found similar processors, use their median rank
    if similar_processors:
        median_rank = int(np.median(similar_processors))
        print(f"Processor not found: {processor_name} - Using median rank of similar processors: {median_rank}")
        return median_rank
    
    # If no similar processors found, use default rank
    print(f"Processor not found and no similar processors: {processor_name} - Using default rank: {len(rankings_dict) + 1}")
    return len(rankings_dict) + 1

def calculate_performance_score(row, rankings_dict):
    """
    Calculate performance score based on processor ranking and hardware specifications.
    Returns a score between 0-100.
    """
    # Get processor ranking using the loaded dictionary
    processor_name = row.get('chipset')
    gpu_name = row.get('gpu')
    processor_rank = get_processor_rank(processor_name, gpu_name, rankings_dict)

    # Define the maximum possible rank based on your CSV + 1 for not found
    max_possible_rank = len(rankings_dict) + 1

    # Convert processor rank to score (higher rank = lower score)
    if processor_rank > len(rankings_dict):
        processor_score = 0
    else:
        processor_score = 100 - ((processor_rank - 1) / (len(rankings_dict)) * 100)
        processor_score = max(0, processor_score)

    # Rest of the scoring logic remains the same...
    # CPU score
    cpu_score = 0
    cpu_str = row.get('cpu')
    if pd.notna(cpu_str) and isinstance(cpu_str, str):
        cpu_str = cpu_str.lower()
        if 'octa' in cpu_str:
            cpu_score = 80
        elif 'hexa' in cpu_str:
            cpu_score = 60
        elif 'quad' in cpu_str:
            cpu_score = 40
        elif 'dual' in cpu_str:
            cpu_score = 20
        else:
            numbers = re.findall(r'\d+', cpu_str)
            if numbers:
                cpu_score = min(int(numbers[0]) * 10, 100)

    # GPU score
    gpu_score = 50
    if pd.notna(gpu_name) and isinstance(gpu_name, str):
        if 'Adreno' in gpu_name:
            gpu_score = 80
        elif 'Mali' in gpu_name:
            gpu_score = 70
        elif 'PowerVR' in gpu_name:
            gpu_score = 60

    # RAM score
    ram_score = 0
    ram_gb = row.get('ram_gb')
    ram_type = row.get('ram_type')
    if pd.notna(ram_gb):
        ram_score = min(ram_gb * 10, 100)
        if pd.notna(ram_type) and isinstance(ram_type, str):
            if 'LPDDR5' in ram_type:
                ram_score *= 1.2
            elif 'LPDDR4X' in ram_type:
                ram_score *= 1.1

    # Storage score
    storage_score = 0
    storage_gb = row.get('storage_gb')
    storage_type = row.get('storage_type')
    if pd.notna(storage_gb):
        storage_score = min(storage_gb * 0.5, 100)
        if pd.notna(storage_type) and isinstance(storage_type, str):
            if 'UFS 3.1' in storage_type:
                storage_score *= 1.3
            elif 'UFS 3.0' in storage_type:
                storage_score *= 1.2
            elif 'UFS 2.2' in storage_type:
                storage_score *= 1.1

    # Calculate final score with weights
    final_score = (
        processor_score * 0.4 +
        cpu_score * 0.2 +
        gpu_score * 0.2 +
        ram_score * 0.1 +
        storage_score * 0.1
    )

    return round(final_score, 2)

# Apply the function to the DataFrame
if processor_rankings:
    df['performance_score'] = df.apply(lambda row: calculate_performance_score(row, processor_rankings), axis=1)
else:
    print("Skipping performance score calculation due to missing processor rankings.")
    df['performance_score'] = np.nan

Successfully loaded 208 processor rankings from processor_ratings.csv
Found exact match: Mediatek Dimensity 9400e - Rank: 5
Found similar 'Other' processor: Mediatek Helio G200 - Using median rank: 50
Found similar 'Other' processor: Qualcomm SM7750-AB Snapdragon 7 Gen 4 - Using median rank: 50
Found exact match: Mediatek Dimensity 9400 Plus - Rank: 1
Found exact match: Mediatek Dimensity 7300 - Rank: 67
Found partial match (Other): Spreadtrum UniSoC SC9863A -> unisoc sc9863a - Rank: 202
Found exact match: Mediatek Dimensity 7300 - Rank: 67
Found exact match: Mediatek Dimensity 6300 - Rank: 105
Found exact match: Mediatek Dimensity 9400e - Rank: 5
Found similar 'Other' processor: Qualcomm SM8750-3-AB Snapdragon 8 Elite - Using median rank: 50
Found exact match: Mediatek Dimensity 9400e - Rank: 5
Found similar 'Other' processor: Qualcomm SM8750-AB Snapdragon 8 Elite - Using median rank: 50
Found similar 'Other' processor: Qualcomm SM8735 Snapdragon 8s Gen 4 - Using median rank: 50
Found

In [24]:
df[['name', 'chipset', 'gpu','ram', 'performance_score']].head()

Unnamed: 0,name,chipset,gpu,ram,performance_score
0,Realme Neo7 Turbo,Mediatek Dimensity 9400e,Immortalis-G720 MC12,12 GB,85.23
1,Tecno Spark 40,Mediatek Helio G200,Mali-G57 MC2,8 GB,75.62
2,Vivo S30,Qualcomm SM7750-AB Snapdragon 7 Gen 4,Adreno 722,12 GB,83.58
3,OnePlus Ace 5 Ultra,Mediatek Dimensity 9400 Plus,Immortalis-G925 MC12,12 GB,88.0
4,Motorola Moto G86 Power,Mediatek Dimensity 7300,Mali-G615 MC2,8 GB,75.31


Security-based Features:
Connectivity-based Features:
Brand-based Features:
Release-based Features:
Composite Features:

##### Security-based Features

In [29]:
def calculate_security_score(row):
    """
    Calculate security score based on available security features.
    Returns a score between 0-100.
    """
    security_score = 0
    max_score = 100
    weights = {
        'biometric': 0.4,      # 40% weight for biometric security
        'os_security': 0.3,    # 30% weight for OS security
        'build_security': 0.3  # 30% weight for physical security
    }
    
    # 1. Biometric Security Score (40%)
    biometric_score = 0
    
    # Fingerprint sensor
    if pd.notna(row.get('fingerprint_sensor')) and row['fingerprint_sensor']:
        biometric_score += 20
        # Additional points for fingerprint sensor type
        if pd.notna(row.get('finger_sensor_type')):
            sensor_type = str(row['finger_sensor_type']).lower()
            if 'ultrasonic' in sensor_type:
                biometric_score += 10  # Ultrasonic is most secure
            elif 'optical' in sensor_type:
                biometric_score += 8   # Optical is second best
            elif 'in display' in sensor_type:
                biometric_score += 7   # In-display is third best
            elif 'side' in sensor_type:
                biometric_score += 5   # Side-mounted is basic
    
    # Face unlock
    if pd.notna(row.get('face_unlock')) and row['face_unlock']:
        biometric_score += 10
    
    # 2. OS Security Score (30%)
    os_score = 0
    
    # OS Type and Version
    if pd.notna(row.get('operating_system')):
        os = str(row['operating_system']).lower()
        
        # Modern OS scoring
        if 'ios' in os:
            os_score += 15  # iOS has best security
        elif 'harmonyos' in os:
            os_score += 12  # HarmonyOS is modern and secure
        elif 'android' in os:
            os_score += 10  # Android is secure but less than iOS
            # Try to extract version for Android
            try:
                version = float(''.join(filter(str.isdigit, os)))
                os_score += min((version / 14) * 5, 5)  # Additional points for newer versions
            except:
                pass
        elif 'flyme' in os or 'flyme aios' in os:
            os_score += 8   # Flyme is a custom Android with security features
        elif 'tizen' in os:
            os_score += 7   # Tizen is relatively secure
        else:
            os_score += 5   # Basic score for other modern OS
    
    # 3. Physical Security Score (30%)
    physical_score = 0
    
    # IP Rating scoring
    if pd.notna(row.get('ip_rating')):
        ip_rating = str(row['ip_rating']).lower()
        
        # Highest protection
        if any(x in ip_rating for x in ['ip68/ip69', 'ip69', 'ip68/ip69k', 'ip66/ip68/ip69']):
            physical_score += 15
        # High protection
        elif any(x in ip_rating for x in ['ip68', 'ip66/ip68', 'ip65/ip68']):
            physical_score += 12
        # Medium-high protection
        elif any(x in ip_rating for x in ['ip67', 'ip66', 'ip65']):
            physical_score += 10
        # Medium protection
        elif any(x in ip_rating for x in ['ip64', 'ip54', 'ip53']):
            physical_score += 8
        # Basic protection
        elif any(x in ip_rating for x in ['ip52', 'ip50', 'ip48']):
            physical_score += 5
    
    # Ruggedness scoring
    if pd.notna(row.get('ruggedness')):
        ruggedness = str(row['ruggedness']).lower()
        if 'dust proof' in ruggedness or 'dustproof' in ruggedness:
            physical_score += 15
        elif 'dust resistant' in ruggedness:
            physical_score += 10
        elif 'ip53' in ruggedness:  # Some entries have IP53 in ruggedness
            physical_score += 8
    
    # Calculate final weighted score
    final_score = (
        biometric_score * weights['biometric'] +
        os_score * weights['os_security'] +
        physical_score * weights['build_security']
    )
    
    return round(final_score, 2)

# Apply the function to the DataFrame
df['security_score'] = df.apply(calculate_security_score, axis=1)

In [28]:
finger_types = df['finger_sensor_type'].unique().tolist()
os = df['operating_system'].unique().tolist()
ip = df['ip_rating'].unique().tolist()
rug = df['ruggedness'].unique().tolist()

print('finger sensor types:',finger_types)
print('Operating systems:',os)
print('ip ratings:',ip)
print('ruggedness',rug)


finger sensor types: ['Optical', nan, 'Ultrasonic', 'Side-Mounted', 'Side-mounted', 'In Display']
Operating systems: ['Android', 'Feature phones', 'Feature Phone', 'HarmonyOS', 'Flyme', 'FeaturePhone', 'iOS', 'SkyUI', 'Flyme AIOS', nan, 'RTOS OS', 'S30+', 'KaiOS', 'KAI OS', 'Android 10', 'Android v10 (Q)', 'Android v10', 'Java', 'Java Me', 'Feature phone', 'Microsoft Windows', 'Microsoft Windows Phone', 'Windows Phone', 'Tizen', 'Firefox OS', 'Windows', 'Nokia Asha', 'Symbian', 'Symbian Belle', 'TouchWiz UI', 'Symbian^3', 'Maemo', 'Microsoft Windows Mobile', 'Symbian OS', 'Symbian OS v9.2']
ip ratings: ['IP68/IP69', 'IP64', nan, 'IP50', 'IP69', 'IP68', 'IP65/IP68', 'IP65', 'IP54', 'IP67', 'IP68/IP69K', 'IP66/IP68', 'IP48', 'IP53', 'IP66', 'IP66/IP68/IP69', 'IPX8', 'IP65M', 'IPX8/IPX9', 'IP52', 'IPX2', 'IP68, IP69K', 'IP65, IP68', 'IP69/IP68', 'IP68, IP69', 'IPX4', 'IPX4, IP5X', 'IPX5, IPX8, IP6X', 'IP5X, IPX4', 'IPX4/IP5X', 'IPX4, IPX5', 'IP86', 'IPX7', 'IP58', 'IP57']
ruggedness ['Dus

In [30]:
df[['name','fingerprint_sensor','finger_sensor_type', 'operating_system', 'ip_rating' ,'security_score']].head()

Unnamed: 0,name,fingerprint_sensor,finger_sensor_type,operating_system,ip_rating,security_score
0,Realme Neo7 Turbo,Yes,Optical,Android,IP68/IP69,27.2
1,Tecno Spark 40,Yes,,Android,IP64,21.9
2,Vivo S30,Yes,Optical,Android,,18.2
3,OnePlus Ace 5 Ultra,Yes,Optical,Android,,18.2
4,Motorola Moto G86 Power,Yes,Optical,Android,IP68/IP69,27.2


##### Connectivity-based Features

In [33]:
def calculate_connectivity_score(row):
    """
    Calculate connectivity score based on various connectivity features.
    Returns a score between 0-100.
    """
    connectivity_score = 0
    max_score = 100
    weights = {
        'network': 0.3,      # 30% weight for network connectivity
        'wireless': 0.3,     # 30% weight for wireless connectivity
        'ports': 0.2,        # 20% weight for physical ports
        'sim': 0.2          # 20% weight for SIM capabilities
    }
    
    # 1. Network Connectivity Score (30%)
    network_score = 0
    
    # Network type and speed
    if pd.notna(row.get('speed')):
        speed = str(row['speed']).lower()
        
        # 5G scoring
        if '5g' in speed:
            network_score += 15
            # Additional points for 5G speed
            if '7.5' in speed or '7.5 gbps' in speed:
                network_score += 5
            elif '5+' in speed or '5 gbps' in speed:
                network_score += 4
            elif '4.5' in speed or '4.4' in speed or '4 gbps' in speed:
                network_score += 3
            elif '3.3' in speed or '3.6' in speed or '3.7' in speed:
                network_score += 2
            elif '2.3' in speed or '2.4' in speed or '2+' in speed:
                network_score += 1
        
        # LTE scoring
        if 'lte' in speed:
            network_score += 10
            # Additional points for LTE-A and carrier aggregation
            if 'lte-a' in speed:
                network_score += 5
            if 'ca' in speed:
                ca_count = 0
                if '7ca' in speed:
                    ca_count = 7
                elif '6ca' in speed:
                    ca_count = 6
                elif '5ca' in speed:
                    ca_count = 5
                elif '4ca' in speed:
                    ca_count = 4
                elif '3ca' in speed:
                    ca_count = 3
                elif '2ca' in speed:
                    ca_count = 2
                network_score += min(ca_count * 2, 10)
    
    # 2. Wireless Connectivity Score (30%)
    wireless_score = 0
    
    # WLAN scoring
    if pd.notna(row.get('wlan')):
        wlan = str(row['wlan']).lower()
        
        # WiFi version scoring
        if 'wifi 8' in wlan:
            wireless_score += 15
        elif 'wifi 7' in wlan:
            wireless_score += 14
        elif 'wifi 6e' in wlan:
            wireless_score += 12
        elif 'wifi 6' in wlan:
            wireless_score += 10
        elif 'wifi 5' in wlan:
            wireless_score += 8
        elif 'wifi 4' in wlan:
            wireless_score += 5
        elif 'wifi 3' in wlan:
            wireless_score += 3
        elif 'wifi 2' in wlan:
            wireless_score += 2
        
        # Additional features
        if 'mimo' in wlan:
            wireless_score += 5
        if '6ghz' in wlan:
            wireless_score += 3
        if '5ghz' in wlan:
            wireless_score += 2
        if 'dual-band' in wlan:
            wireless_score += 2
    
    # Bluetooth scoring
    if pd.notna(row.get('bluetooth')):
        bluetooth = str(row['bluetooth']).lower()
        
        # Version scoring
        if 'v6.0' in bluetooth or '6.0' in bluetooth:
            wireless_score += 10
        elif 'v5.4' in bluetooth or '5.4' in bluetooth:
            wireless_score += 9
        elif 'v5.3' in bluetooth or '5.3' in bluetooth:
            wireless_score += 8
        elif 'v5.2' in bluetooth or '5.2' in bluetooth:
            wireless_score += 7
        elif 'v5.1' in bluetooth or '5.1' in bluetooth:
            wireless_score += 6
        elif 'v5.0' in bluetooth or '5.0' in bluetooth:
            wireless_score += 5
        elif 'v4.2' in bluetooth or '4.2' in bluetooth:
            wireless_score += 4
        elif 'v4.1' in bluetooth or '4.1' in bluetooth:
            wireless_score += 3
        elif 'v4.0' in bluetooth or '4.0' in bluetooth:
            wireless_score += 2
        
        # Additional features
        if 'aptx' in bluetooth:
            wireless_score += 2
        if 'le' in bluetooth:
            wireless_score += 1
    
    # 3. Physical Ports Score (20%)
    ports_score = 0
    
    # USB
    if pd.notna(row.get('usb')):
        usb = str(row['usb']).lower()
        if 'usb-c' in usb or 'usb c' in usb:
            ports_score += 10
        elif 'usb' in usb:
            ports_score += 5
    
    # USB OTG
    if pd.notna(row.get('usb_otg')) and row['usb_otg']:
        ports_score += 5
    
    # 4. SIM Capabilities Score (20%)
    sim_score = 0
    
    # SIM Slot
    if pd.notna(row.get('sim_slot')):
        sim = str(row['sim_slot']).lower()
        if 'dual' in sim:
            sim_score += 10
        elif 'single' in sim:
            sim_score += 5
    
    # VoLTE
    if pd.notna(row.get('volte')) and row['volte']:
        sim_score += 10
    
    # Calculate final weighted score
    final_score = (
        network_score * weights['network'] +
        wireless_score * weights['wireless'] +
        ports_score * weights['ports'] +
        sim_score * weights['sim']
    )
    
    return round(final_score, 2)

# Apply the function to the DataFrame
df['connectivity_score'] = df.apply(calculate_connectivity_score, axis=1)

In [31]:
wifi = df['wlan'].unique().tolist()
speed = df['speed'].unique().tolist()
bluetooth = df['bluetooth'].unique().tolist()
usb = df['usb'].unique().tolist()

print('wlan types:',wifi)
print('speed types:',speed)
print('bluetooth types:',bluetooth)
print('usb',usb)

wlan types: ['Wi-Fi 7 (802.11 a/b/g/n/ac/be/ax) 5GHz 6GHz, MIMO', 'Wi-Fi 5 (802.11 a/b/g/n/ac) 5GHz', 'Wi-Fi 6 (802.11 a/b/g/n/ac/ax) 5GHz, MIMO', 'Wi-Fi 4 (802.11 b/g/n)', nan, 'Wi-Fi 6E (802.11 a/b/g/n/ac/ax) 5GHz 6GHz, MIMO', 'Wi-Fi 6 (802.11 a/b/g/n/ac/ax) 5GHz', 'Wi-Fi 5 (802.11 a/b/g/n/ac)', 'Wi-Fi 7 (802.11 a/b/g/n/ac/be/ax) 5GHz 6GHz', 'Wi-Fi 6E (802.11 a/b/g/n/ac/ax) 5GHz 6GHz', 'Wi-Fi 4 (802.11 a/b/g/n)', 'Wi-Fi 5 (802.11 a/b/g/n/ac) 5GHz, MIMO', 'Wi-Fi 6E (802.11 a/b/g/n/ax) 5GHz 6GHz', 'Wi-Fi 5 (802.11 b/g/n/ac)', 'Wi-Fi 6 (802.11 a/b/g/n/ac/ax) 5GHz 6GHz, MIMO', 'Wi-Fi 4 (802.11 a/b/g/n/ac)', 'Wi-Fi 5 (802.11 b/g/n/ac) 5GHz', 'Wi-Fi 4 (802.11 b/g/n), MIMO', 'Wi-Fi 6 (802.11 b/g/n/ac) 5GHz', 'Wi-Fi 802.11 a/b/g/n/ac, dual-band', 'Wi-Fi 5 (802.11 a/b/e/g/n/ac) 5GHz, MIMO', 'Wi-Fi 6E (802.11 a/b/g/n/ac/ax) 6GHz, MIMO', 'Wi-Fi 5 (802.11 b/g/n)', 'Wi-Fi 5 (802.11 a/b/g/n/ac/6e) 5GHz', 'Wi-Fi 4 (802.11 b/g/n) 5GHz', 'Wi-Fi 5 (802.11 a/b/n/ac)', 'Wi-Fi 7 (802.11 a/b/g/n/x/ac/be/a

In [34]:
df[['name', 'connectivity_score']].head()

Unnamed: 0,name,connectivity_score
0,Realme Neo7 Turbo,20.2
1,Tecno Spark 40,12.7
2,Vivo S30,19.3
3,OnePlus Ace 5 Ultra,20.2
4,Motorola Moto G86 Power,18.1


##### Brand-based Features

In [35]:
brand = df['brand'].unique().tolist()

print('brand list', brand)

brand list ['Realme', 'Tecno', 'Vivo', 'OnePlus', 'Motorola', 'Walton', 'Acer', 'Samsung', 'Sony', 'iQOO', 'Lava', 'Symphony', 'Itel', 'Infinix', 'Honor', 'ZTE', 'Xiaomi', 'Proton', 'Oppo', 'Doogee', 'Huawei', 'Meizu', 'Alcatel', 'TCL', 'Nothing', 'Oscal', 'Benco', 'Coolpad', 'Ulefone', 'Google', 'HMD', 'Helio', 'HTC', 'Nokia', 'Energizer', 'Apple', 'Asus', 'Umidigi', 'Thuraya', 'Sonim', 'Sharp', 'Philips', 'Blackview', 'Oukitel', 'Cubot', 'DOOGEE', 'Nio', 'Geo', 'Hallo', 'Wiko', 'PROTON', 'FreeYond', '5star', 'Maximus', 'Leica', 'Lenovo', 'Bengal', 'LG', 'Leitz', nan, 'maximus', 'TECNO', 'vivo', 'UMIDIGI', 'We', 'WE', 'Panasonic', 'Micromax', 'Allview', 'WE X2', 'Cat', 'XTRA', 'Mycell', 'BlackBerry', 'Okapia', 'Oneplus', 'Kingstar', 'LAVA', 'Microsoft', 'Gionee', 'OnePlus 2', 'Celkon', 'Maxis']
