Train Data Cleaning

In [25]:
import pandas as pd

df = pd.read_csv("train.csv")
#identify missing values in the columns
missing_values = df.isnull().sum()
print("Missing values in each column:")
print(missing_values)

#remove rows with missing values

df = df.dropna()
print("Data after dropping rows with missing values:")
print(df)
#data after dropping rows with missing values
df = df.dropna()
print("Data after dropping columns with missing values:")
print(df.head())

#print the number of duplicate values
duplicates = df.duplicated()
print("Number of duplicate rows:", duplicates.sum())

#save the cleaned file
df.to_csv("cleaned_dataset.csv", index=False)


Missing values in each column:
Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64
Data after dropping rows with missing values:
     Item_Identifier  Item_Weight Item_Fat_Content  Item_Visibility  \
0              FDA15        9.300          Low Fat         0.016047   
1              DRC01        5.920          Regular         0.019278   
2              FDN15       17.500          Low Fat         0.016760   
4              NCD19        8.930          Low Fat         0.000000   
5              FDP36       10.395          Regular         0.000000   
...              ...          ...              ...              ...   
8517 

Encode Cleaned Train Data

In [31]:
# Import necessary libraries
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

train_encode = pd.read_csv('cleaned_dataset.csv')
train_encode = pd.DataFrame(train_encode)

# Separate features and target
X = train_encode[['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Type']]
y = train_encode['Item_Outlet_Sales']

# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, drop='first')  # `drop='first'` removes the first category to avoid multicollinearity

# Fit and transform the features
X_encoded = encoder.fit_transform(X)

# Convert to DataFrame and add column names
encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(X.columns))

# Concatenate encoded features with target variable
df_encoded = pd.concat([encoded_df, y], axis=1)

#print(df_encoded)

# Save the encoded dataframe to a CSV file
df_encoded.to_csv("encoded_train_data.csv", index=False)


Model Training (Linear Regression)

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score  # Import necessary metrics
import joblib

# Load and prepare data
data = pd.read_csv('encoded_train_data.csv')

# Specify the name of the target column
target_column = 'Item_Outlet_Sales' # Replace with your actual target column name

# Separate features and target
X = data.drop(columns=[target_column])  # Features (independent variables)
y = data[target_column]  # Target (dependent variable)

# Initialize the Linear Regression model
model = LinearRegression()

# Fit the model with the entire dataset
model.fit(X, y)

# Make predictions on the same data (since we're not splitting)
y_pred = model.predict(X)

# Evaluate the model using regression metrics
# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y, y_pred)
# Calculate R-squared (R2)
r2 = r2_score(y, y_pred)

# Print the evaluation metrics
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

# Save the trained model
joblib.dump(model, 'model_1')





Clean and Encode Test Dataset

In [34]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
#Load the train dataset
test = pd.read_csv('test.csv')

#Identify missing values in the columns

missing_values = test.isnull().sum()

#remove rows with missing values
test = test.dropna()


# Separate features and target
X = test_encode[['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Type']]
y = test_encode[['Item_Identifier','Outlet_Identifier']]

# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, drop='first')  # `drop='first'` removes the first category to avoid multicollinearity

# Fit and transform the features
X_encoded = encoder.fit_transform(X)

# Convert to DataFrame and add column names
encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(X.columns))

# Concatenate encoded features with target variable
df_encoded = pd.concat([encoded_df, y], axis=1)

#print(df_encoded)

# Save the encoded dataframe to a CSV file
df_encoded.to_csv("encoded_test_data.csv", index=False)


Merge Train and Test Dataset

In [None]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Merge datasets on 'PassengerId' to add the 'Survived' column to the test dataset
merged_data = test_data.merge(train_data[['Item_Identifier', 'Item_Outlet_Sales']], on='Item_Identifier', how='left')

# Save the updated test dataset with the 'Survived' column
merged_data.to_csv('train_test_merge.csv', index=False)


Encode Merge Dataset

In [None]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

df = pd.read_csv("train_test_merge.csv")

df = pd.DataFrame(df)

# Separate features and target
X = train_encode[['Item_Fat_Content', 'Item_Type', 'Outlet_Type', 'Outlet_Size']]
y = train_encode[['Item_Identifier',  'Outlet_Identifier', 'Item_Outlet_Sales']] # change to list of columns

# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, drop='first')  # `drop='first'` removes the first category to avoid multicollinearity

# Fit and transform the features
X_encoded = encoder.fit_transform(X)

# Convert to DataFrame and add column names
encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(X.columns))

# Concatenate encoded features with target variable
df_encoded = pd.concat([encoded_df, y], axis=1)

#print(df_encoded)

# Save the encoded dataframe to a CSV file
df_encoded.to_csv("encode_merge_dataset.csv", index=False)


Predict the new Price

In [None]:
import pandas as pd
import joblib

# Load the model
model = joblib.load('model_1')

# Load the training data to get the original feature names
training_data = pd.read_csv('encoded_train_data.csv')  # Replace with the path to your encoded training data
training_features = training_data.drop(columns=['Item_Outlet_Sales']).columns  # Get the feature names from the training data

# Load the test data
test_data = pd.read_csv('encoded_test_data.csv')

# Specify the name of the target column (if it exists in test data, remove it)
target_column = 'Item_Outlet_Sales'  # Update to the actual target column

# **Store 'Item_Identifier' and 'Outlet_Identifier' before dropping**
index_ids = test_data['Item_Identifier'].tolist()
index_ids2 = test_data['Outlet_Identifier'].tolist()

# Drop the target column if it exists in test data
if target_column in test_data.columns:
    test_data = test_data.drop(columns=[target_column])

# *** Drop 'Item_Identifier' and 'Outlet_Identifier' as they are not part of the model's features ***
test_data = test_data.drop(columns=['Item_Identifier', 'Outlet_Identifier'])

# Align the test data columns with the training data
# Use 'training_features' as the reference for column order
# and fill missing columns with 0
test_data = test_data.reindex(columns=training_features, fill_value=0)


# Make predictions for each row in the test data
predictions = model.predict(test_data)  # Use the preprocessed test data

# Create a DataFrame with Item_Identifier, Outlet_Identifier, and Item_Outlet_Sales columns
results = pd.DataFrame({
    'Item_Identifier': index_ids,
    'Outlet_Identifier': index_ids2,
    'Item_Outlet_Sales': predictions
})

# Display the results
print(results)

# Optionally, save the results to a CSV file
results.to_csv('predictions.csv', index=False)


In [None]:
import pandas as pd
import joblib

# Load the model
model = joblib.load('model_1')

# Load the training data to get the original feature names
training_data = pd.read_csv('encoded_train_data.csv')  # Replace with the path to your encoded training data
training_features = training_data.drop(columns=['Item_Outlet_Sales']).columns  # Get the feature names from the training data

# Load the test data
test_data = pd.read_csv('encoded_test_data.csv')

# Specify the name of the target column (if it exists in test data, remove it)
target_column = 'Item_Identifier', 'Outlet_Type'  # Replace with your actual target column name, if needed

# **Store 'Item_Identifier' before reindexing**
index_ids = test_data['Item_Identifier'].tolist()

# **Check if 'Outlet_Type' exists in test_data before accessing it**
if 'Outlet_Type' in test_data.columns:
    index_ids2 = test_data['Outlet_Type'].tolist()
else:
    # Handle the case where 'Outlet_Type' is missing
    # You might want to:
    # 1. Load the original test data to get 'Outlet_Type'
    # 2. Create a placeholder list for index_ids2
    # 3. Raise an error indicating the missing column
    original_test_data = pd.read_csv('test.csv')  # Assuming your original test data is in 'test.csv'
    index_ids2 = original_test_data['Outlet_Type'].tolist()
    # Or create a placeholder:
    # index_ids2 = ['Unknown'] * len(test_data)
    # Or raise an error:
    # raise KeyError("Column 'Outlet_Type' not found in test data.")


# Remove the target column if it exists in test data
if target_column in test_data.columns:
    test_data = test_data.drop(columns=[target_column])

# *** Instead of dropping, keep only columns used for training ***
# This ensures that test_data has the same columns as training_data
test_data = test_data[training_features]

# Make predictions for each row in the test data
predictions = model.predict(test_data)

# Create a DataFrame with Item_Identifier and Item_Outlet_Sales columns
results = pd.DataFrame({
    'Item_Identifier': index_ids,
    'Outlet_Type': index_ids2,
    'Item_Outlet_Sales': predictions
})

# Display the results
print(results)

# Optionally, save the results to a CSV file
results.to_csv('predictions.csv', index=False)