In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

### DATA LOADING

In [3]:
# Load data
df_train = pd.read_csv(r'C:\Users\USER\Desktop\sales pred\Train.csv')
df_test = pd.read_csv(r'C:\Users\USER\Desktop\sales pred\Test.csv')
print("Train columns:", df_train.columns.tolist())

Train columns: ['Item_Weight', 'Item_Fat_Content', 'Item_Type', 'Item_MRP', 'Outlet_Establishment_Year', 'Outlet_Location', 'Outlet_Type', 'Item_Outlet_Sales']


### EVALUATION FUNCTION

In [4]:
def evaluate_model(true_values, predicted_values):
    rmse = np.sqrt(mean_squared_error(true_values, predicted_values))
    mae = mean_absolute_error(true_values, predicted_values)
    r2 = r2_score(true_values, predicted_values)
    return mae, rmse, r2

### DATA CLEANING AND HANDLING MISSING VALUES

In [13]:
# Define columns
categorical_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Type']
numerical_cols = ['Item_Weight', 'Item_MRP', 'Outlet_Establishment_Year']

# Check NaNs
print("NaN counts in df_train before cleaning:")
print(df_train.isna().sum())

# Clean Item_Fat_Content
df_train['Item_Fat_Content'] = df_train['Item_Fat_Content'].replace({'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'})

# Fill Item_Weight by Item_Type mean, then global mean if still NaN
df_train['Item_Weight'] = df_train.groupby('Item_Type')['Item_Weight'].transform(lambda x: x.fillna(x.mean()))
df_train['Item_Weight'] = df_train['Item_Weight'].fillna(df_train['Item_Weight'].mean())

# Drop rows where target is NaN
df_train = df_train.dropna(subset=['Item_Outlet_Sales'])

# Verify no NaNs in key columns
print("NaN counts after cleaning:")
print(df_train[numerical_cols + ['Item_Outlet_Sales']].isna().sum())

NaN counts in df_train before cleaning:
Item_Weight                  0
Item_Fat_Content             0
Item_Type                    0
Item_MRP                     0
Outlet_Establishment_Year    0
Outlet_Location              0
Outlet_Type                  0
Item_Outlet_Sales            0
Outlet_Age                   0
Is_High_Fat                  0
Item_MRP_per_kg              1
Item_MRP_log                 0
Item_MRP_per_kg_log          1
Item_Outlet_Sales_log        0
dtype: int64
NaN counts after cleaning:
Item_Weight                  0
Item_MRP                     0
Outlet_Establishment_Year    0
Item_Outlet_Sales            0
dtype: int64


### FEATURE ENGINEERING

In [14]:
# New features
df_train['Outlet_Age'] = 2025 - df_train['Outlet_Establishment_Year']
df_train['Is_High_Fat'] = df_train['Item_Fat_Content'].apply(lambda x: 1 if x == 'Regular' else 0)
df_train['Item_MRP_per_kg'] = df_train['Item_MRP'] / df_train['Item_Weight']

# Replace inf/nan in Item_MRP_per_kg
df_train['Item_MRP_per_kg'] = df_train['Item_MRP_per_kg'].replace([np.inf, -np.inf], np.nan)
df_train['Item_MRP_per_kg'] = df_train['Item_MRP_per_kg'].fillna(df_train['Item_MRP_per_kg'].mean())

# Update numerical columns
numerical_cols = ['Item_Weight', 'Item_MRP', 'Outlet_Age', 'Item_MRP_per_kg']
binary_cols = ['Is_High_Fat']

# Check for NaNs
print("NaN counts in new features:")
print(df_train[numerical_cols + binary_cols].isna().sum())

NaN counts in new features:
Item_Weight        0
Item_MRP           0
Outlet_Age         0
Item_MRP_per_kg    0
Is_High_Fat        0
dtype: int64


### LOG TRANSFORM SKEWED FEATURES AND TARGET

In [15]:
# Log transform skewed features and target
df_train['Item_MRP_log'] = np.log1p(df_train['Item_MRP'])
df_train['Item_MRP_per_kg_log'] = np.log1p(df_train['Item_MRP_per_kg'])
df_train['Item_Outlet_Sales_log'] = np.log1p(df_train['Item_Outlet_Sales'])

# Update numerical columns for scaling
numerical_cols_log = ['Item_Weight', 'Item_MRP_log', 'Outlet_Age', 'Item_MRP_per_kg_log']

# Debug
print("Sample log-transformed target:", df_train['Item_Outlet_Sales_log'].head())

Sample log-transformed target: 0    16.432124
1    14.301108
2    15.854976
3    14.802883
4    15.109030
Name: Item_Outlet_Sales_log, dtype: float64


### ONE-HOT ENCODING CATEGORICAL VARIABLES

In [16]:
# One-hot encode categorical variables
df_train_encoded = pd.get_dummies(df_train[categorical_cols], columns=categorical_cols, drop_first=True)

# Combine with numerical and binary features
X = pd.concat([df_train[numerical_cols_log + binary_cols], df_train_encoded], axis=1)
y = df_train['Item_Outlet_Sales_log']  # Use log-transformed target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Debug
print("X_train columns:", X_train.columns.tolist())
print("Sample y_train values (log):", y_train[:5].values)

X_train columns: ['Item_Weight', 'Item_MRP_log', 'Outlet_Age', 'Item_MRP_per_kg_log', 'Is_High_Fat', 'Item_Fat_Content_Regular', 'Item_Type_Breads', 'Item_Type_Breakfast', 'Item_Type_Canned', 'Item_Type_Dairy', 'Item_Type_Frozen Foods', 'Item_Type_Fruits and Vegetables', 'Item_Type_Hard Drinks', 'Item_Type_Health and Hygiene', 'Item_Type_Household', 'Item_Type_Meat', 'Item_Type_Others', 'Item_Type_Seafood', 'Item_Type_Snack Foods', 'Item_Type_Soft Drinks', 'Item_Type_Starchy Foods']
Sample y_train values (log): [15.94304452 11.95870044 14.16800518 16.77487657 16.27285432]


### FEATURE SCALING

In [17]:
# Scale only the numerical features
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[numerical_cols_log] = scaler.fit_transform(X_train[numerical_cols_log])
X_test_scaled[numerical_cols_log] = scaler.transform(X_test[numerical_cols_log])

# Check for NaNs after scaling
print("NaN counts in X_train_scaled:")
print(X_train_scaled.isna().sum())
print("NaN counts in X_test_scaled:")
print(X_test_scaled.isna().sum())

NaN counts in X_train_scaled:
Item_Weight                        0
Item_MRP_log                       0
Outlet_Age                         0
Item_MRP_per_kg_log                0
Is_High_Fat                        0
Item_Fat_Content_Regular           0
Item_Type_Breads                   0
Item_Type_Breakfast                0
Item_Type_Canned                   0
Item_Type_Dairy                    0
Item_Type_Frozen Foods             0
Item_Type_Fruits and Vegetables    0
Item_Type_Hard Drinks              0
Item_Type_Health and Hygiene       0
Item_Type_Household                0
Item_Type_Meat                     0
Item_Type_Others                   0
Item_Type_Seafood                  0
Item_Type_Snack Foods              0
Item_Type_Soft Drinks              0
Item_Type_Starchy Foods            0
dtype: int64
NaN counts in X_test_scaled:
Item_Weight                        0
Item_MRP_log                       0
Outlet_Age                         0
Item_MRP_per_kg_log                0
Is_

### TRAINING LINEAR REGRESSION MODEL

In [18]:
# Train the model on log-transformed target
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predictions (in log space)
y_train_pred_log = model.predict(X_train_scaled)
y_test_pred_log = model.predict(X_test_scaled)

### INVERSE TRANSFORMATION AND EVALUATION

In [19]:
# Inverse transform predictions and true values to original UGX scale
y_train_pred = np.expm1(y_train_pred_log)  # expm1 reverses log1p
y_test_pred = np.expm1(y_test_pred_log)
y_train_orig = np.expm1(y_train)
y_test_orig = np.expm1(y_test)

# Debug
print("Sample y_train_pred (UGX):", y_train_pred[:5])
print("Sample y_test_pred (UGX):", y_test_pred[:5])

# Evaluate on original scale
train_mae, train_rmse, train_r2 = evaluate_model(y_train_orig, y_train_pred)
test_mae, test_rmse, test_r2 = evaluate_model(y_test_orig, y_test_pred)

Sample y_train_pred (UGX): [ 3772320.11528229  1426978.03965616  1081410.74138163 14751609.31916894
 11862618.9767784 ]
Sample y_test_pred (UGX): [10491109.05216125  5752694.8639348   8608805.80375234  1914894.26672928
  6645587.38417434]


In [20]:
# Print results
print('Linear Regression with Improvements')
print('Model performance for Training set')
print(f'- Root Mean Squared Error: {train_rmse:.4f} UGX')
print(f'- Mean Absolute Error: {train_mae:.4f} UGX')
print(f'- R2 Score: {train_r2:.4f}')
print('----------------------------------')
print('Model performance for Test set')
print(f'- Root Mean Squared Error: {test_rmse:.4f} UGX')
print(f'- Mean Absolute Error: {test_mae:.4f} UGX')
print(f'- R2 Score: {test_r2:.4f}')
print('='*35)

# Save results
file_path = r'C:\Users\USER\Desktop\sales pred\notebook\linear_regression_results.txt'
with open(file_path, 'w') as f:
    f.write('Linear Regression with Improvements\n')
    f.write('Model performance for Training set\n')
    f.write(f'- Root Mean Squared Error: {train_rmse:.4f} UGX\n')
    f.write(f'- Mean Absolute Error: {train_mae:.4f} UGX\n')
    f.write(f'- R2 Score: {train_r2:.4f}\n')
    f.write('----------------------------------\n')
    f.write('Model performance for Test set\n')
    f.write(f'- Root Mean Squared Error: {test_rmse:.4f} UGX\n')
    f.write(f'- Mean Absolute Error: {test_mae:.4f} UGX\n')
    f.write(f'- R2 Score: {test_r2:.4f}\n')
    f.write('='*35 + '\n')
print(f'Results saved to {file_path}')

Linear Regression with Improvements
Model performance for Training set
- Root Mean Squared Error: 5096638.2828 UGX
- Mean Absolute Error: 3633812.9552 UGX
- R2 Score: 0.3366
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5639155.7250 UGX
- Mean Absolute Error: 4263280.0465 UGX
- R2 Score: 0.3283
Results saved to C:\Users\USER\Desktop\sales pred\notebook\linear_regression_results.txt
