DATA CLEANING

In [1]:
from google.colab import files
uploaded = files.upload()



Saving Test.csv to Test.csv


In [2]:
# STEP 1: Upload file (only for Colab)
from google.colab import files
uploaded = files.upload()

# STEP 2: Import libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer

# STEP 3: Read uploaded file
df = pd.read_csv('Test.csv')

# STEP 4: Clean and preprocess

# Strip whitespace from column names
df.columns = df.columns.str.strip()

# Standardize Item_Fat_Content
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({
    'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'
})

# Fill missing Item_Weight with mean
imputer = SimpleImputer(strategy='mean')
df['Item_Weight'] = imputer.fit_transform(df[['Item_Weight']])

# Fill missing Outlet_Size with mode
df['Outlet_Size'].fillna(df['Outlet_Size'].mode()[0], inplace=True)

# Create new feature: Outlet_Age
df['Outlet_Age'] = 2025 - df['Outlet_Establishment_Year']

# Label encode categorical columns
cat_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
            'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']

encoder = LabelEncoder()
for col in cat_cols:
    df[col] = encoder.fit_transform(df[col])

# Normalize Item_MRP
scaler = MinMaxScaler()
df[['Item_MRP']] = scaler.fit_transform(df[['Item_MRP']])

# STEP 5: Save cleaned file
df.to_csv('Cleaned_Test.csv', index=False)

# STEP 6: Download cleaned file (for Colab)
files.download('Cleaned_Test.csv')

print("✅ Cleaning Done! File saved as 'Cleaned_Test.csv'")


Saving Test.csv to Test (1).csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Outlet_Size'].fillna(df['Outlet_Size'].mode()[0], inplace=True)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Cleaning Done! File saved as 'Cleaned_Test.csv'


PRE-PROCESSING

In [3]:
# STEP 1: Upload the CSV file (Google Colab)
from google.colab import files
uploaded = files.upload()

# STEP 2: Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer

# STEP 3: Read the CSV file
df = pd.read_csv('Test.csv')  # Make sure the file name matches

# STEP 4: Preview
print("Original Shape:", df.shape)
print("Missing Values:\n", df.isnull().sum())

# STEP 5: Remove duplicates
df.drop_duplicates(inplace=True)

# STEP 6: Clean column names
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# STEP 7: Handle missing values
# Example: Fill numerical with mean
num_cols = df.select_dtypes(include=[np.number]).columns
for col in num_cols:
    df[col] = df[col].fillna(df[col].mean())

# Example: Fill categorical with mode
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# STEP 8: Standardize specific values (example: gender, fat content)
if 'item_fat_content' in df.columns:
    df['item_fat_content'] = df['item_fat_content'].replace({
        'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'
    })

# STEP 9: Convert date columns (example)
for col in df.columns:
    if 'date' in col:
        try:
            df[col] = pd.to_datetime(df[col])
        except:
            pass

# STEP 10: Encode categorical columns
encoder = LabelEncoder()
for col in cat_cols:
    if df[col].nunique() <= 20:  # avoid high cardinality
        df[col] = encoder.fit_transform(df[col])

# STEP 11: Normalize numerical columns
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# STEP 12: Save cleaned file
df.to_csv('Cleaned_Test.csv', index=False)
files.download('Cleaned_Test.csv')

print("✅ Preprocessing complete! Cleaned file saved as Cleaned_Test.csv")


Saving Cleaned_Test.csv to Cleaned_Test (1).csv
Original Shape: (5681, 11)
Missing Values:
 Item_Identifier                 0
Item_Weight                   976
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Preprocessing complete! Cleaned file saved as Cleaned_Test.csv


In [12]:
# Upload file (if in Google Colab)
from google.colab import files
uploaded = files.upload()

# Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Read file
df = pd.read_csv('Test.csv')

# Clean column names
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# Print original shape and missing values
print("📂 Original Shape:", df.shape)
print("\n🔍 Missing Values (Before Cleaning):\n", df.isnull().sum())

# Remove duplicates
df.drop_duplicates(inplace=True)

# Handle missing numerical columns
num_cols = df.select_dtypes(include=[np.number]).columns
for col in num_cols:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].mean())

# Handle missing categorical columns
cat_cols = df.select_dtypes(include='object').columns
for col in cat_cols:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].mode()[0])

# Fix standard category labels (example: item_fat_content)
if 'item_fat_content' in df.columns:
    df['item_fat_content'] = df['item_fat_content'].replace({
        'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'
    })

# Encode categorical features
encoder = LabelEncoder()
for col in cat_cols:
    if df[col].nunique() <= 20:  # avoid large cardinality
        df[col] = encoder.fit_transform(df[col])

# Normalize numeric columns
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Print cleaned missing values
print("\n✅ Missing Values (After Cleaning):\n", df.isnull().sum())

# Save cleaned file
df.to_csv('Cleaned_Test.csv', index=False)
files.download('Cleaned_Test.csv')

print("\n✅ Cleaning Done! File saved as 'Cleaned_Test.csv'")


Saving Cleaned_Test.csv to Cleaned_Test (2).csv
📂 Original Shape: (5681, 11)

🔍 Missing Values (Before Cleaning):
 item_identifier                 0
item_weight                   976
item_fat_content                0
item_visibility                 0
item_type                       0
item_mrp                        0
outlet_identifier               0
outlet_establishment_year       0
outlet_size                  1606
outlet_location_type            0
outlet_type                     0
dtype: int64

✅ Missing Values (After Cleaning):
 item_identifier              0
item_weight                  0
item_fat_content             0
item_visibility              0
item_type                    0
item_mrp                     0
outlet_identifier            0
outlet_establishment_year    0
outlet_size                  0
outlet_location_type         0
outlet_type                  0
dtype: int64


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Cleaning Done! File saved as 'Cleaned_Test.csv'
