In [41]:
# Data Ingestion Steps
import pandas as pd
df = pd.read_csv('data/Camp.csv')
df.head()

Unnamed: 0,Campaign_Name,Id,Status,Date,Amount,Payout,Sub_Id,Currency,Partner,Inventory,Email_Id
0,Flipkart,1122116164,pending,1/31/2025,1372.0,25.5,telegram,INR,Admitad,Flipshope,agoyal@flipshope.com
1,Flipkart,1122095405,pending,1/31/2025,231.0,5.9,app,INR,Admitad,Flipshope,agoyal@flipshope.com
2,Flipkart,1122082078,pending,1/31/2025,81064.0,48.6,store_page,INR,Admitad,Flipshope,agoyal@flipshope.com
3,Flipkart,1122062595,pending,1/31/2025,393.0,10.1,app,INR,Admitad,Flipshope,agoyal@flipshope.com
4,Flipkart,1122043961,pending,1/31/2025,214.0,5.1,app,INR,Admitad,Flipshope,agoyal@flipshope.com


In [42]:
df.drop(columns=['Id', 'Status', 'Date', 'Currency', 'Inventory', 'Email_Id'], inplace=True)
df.head()

Unnamed: 0,Campaign_Name,Amount,Payout,Sub_Id,Partner
0,Flipkart,1372.0,25.5,telegram,Admitad
1,Flipkart,231.0,5.9,app,Admitad
2,Flipkart,81064.0,48.6,store_page,Admitad
3,Flipkart,393.0,10.1,app,Admitad
4,Flipkart,214.0,5.1,app,Admitad


In [43]:
# Step 1: Strip whitespace and maybe do a simple replace
df['Campaign_Name'] = df['Campaign_Name'].str.strip()

# Optional: If there are variations like 'Puma [CPS] IN', map them manually
df['Campaign_Name'] = df['Campaign_Name'].replace({
    'Puma [CPS] IN': 'Puma',
    'Fernsnpetals': 'Fernsnpetals',
    'Myntra ': 'Myntra',
    'Flipkart ': 'Flipkart'
})
df['Campaign_Name'].value_counts().reset_index(name='counts')

Unnamed: 0,Campaign_Name,counts
0,Flipkart,4579
1,Ajio,1945
2,Myntra,1095
3,GoDaddy,60
4,Croma Retail,53
5,Firstcry,15
6,Nykaa Beauty,6
7,Puma,6
8,Boat,6
9,Fernsnpetals,3


In [44]:
X = df.drop(labels=['Payout'], axis = 1)
Y = df[['Payout']]

In [45]:
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [46]:
# Define the custom ranking for each ordinal variable
Campaign_categories = ['Pepperfry', 'Shopclues IN', 'Boat', 'Fernsnpetals', 'Puma', 'Firstcry', 'GoDaddy', 'Croma Retail', 'Myntra', 'Ajio', 'Flipkart']
Sub_Id_categories = ['web', 'store_page', 'telegram', 'app']
Partner_categories = ['Cuelinks', 'Admitad']

In [47]:
! pip install scikit-learn



In [48]:
from sklearn.impute import SimpleImputer ## Handling missing values
from sklearn.preprocessing import StandardScaler ## Handling Feature scaling
from sklearn.preprocessing import OrdinalEncoder ## Ordinal Encoding
# pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [49]:
# Numerical Pipeline
num_pipeline = Pipeline(
             steps=[
              ('imputer', SimpleImputer(strategy='median')),
               ('scaler', StandardScaler())   
          
             ]    
) 

# Categorical Pipeline

cat_pipeline = Pipeline(
              steps=[
               ('imputer', SimpleImputer(strategy='most_frequent')),
               ('ordinal_encoder', OrdinalEncoder(categories=[
                 Campaign_categories,Sub_Id_categories, Partner_categories],
                handle_unknown='use_encoded_value',unknown_value=-1)),
               ('scaler', StandardScaler())
              ]
     
)

preprocessor = ColumnTransformer([
('num_pipeline', num_pipeline, numerical_cols),
('cat_pipline', cat_pipeline, categorical_cols)
])


In [50]:
# Train Test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42)


In [51]:
X_train =pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())
X_test =pd.DataFrame(preprocessor.transform(X_test), columns=preprocessor.get_feature_names_out())

In [52]:
X_train.head()

Unnamed: 0,num_pipeline__Amount,cat_pipline__Campaign_Name,cat_pipline__Sub_Id,cat_pipline__Partner
0,-0.062442,-0.406839,0.622395,-1.201601
1,-0.112567,-1.482778,0.622395,-1.201601
2,-0.128221,-1.482778,-0.631151,-1.201601
3,-0.160735,-0.406839,0.622395,-1.201601
4,-0.216128,0.6691,0.622395,0.832223


In [53]:
X_test.head()

Unnamed: 0,num_pipeline__Amount,cat_pipline__Campaign_Name,cat_pipline__Sub_Id,cat_pipline__Partner
0,-0.203484,0.6691,0.622395,0.832223
1,-0.160735,-0.406839,0.622395,-1.201601
2,0.266758,-1.482778,-0.631151,-1.201601
3,-0.13364,-1.482778,0.622395,-1.201601
4,-0.010209,-1.482778,-0.631151,-1.201601


In [54]:
# Model Training
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


In [55]:
regression = LinearRegression()
regression.fit(X_train, y_train)

In [56]:
regression.coef_

array([[ 22.43099435,   2.8245674 ,  -3.97334382, -18.52332373]])

In [57]:
regression.intercept_

array([24.15359375])

In [58]:
import numpy as np
def evaluate_model(true, predicted):
     mae = mean_absolute_error(true, predicted)
     mse = mean_squared_error(true, predicted)
     rmse = np.sqrt(mean_squared_error(true, predicted))
     r2_square = r2_score(true, predicted)
     return mae, mse, rmse, r2_square

In [59]:
# Train multiple models
models = {
     'LinearRegression': LinearRegression(),
     'Lasso' : Lasso(),
     'Ridge' : Ridge(),
     'ElasticNet' : ElasticNet()
}

model_list = []
r2_list = []
for i in range(len(list(models))):
     model = list(models.values())[i]
     model.fit(X_train, y_train)
     
     # Make Prediction
     y_pred = model.predict(X_test)
     mae, mse, rmse, r2_square = evaluate_model(y_test, y_pred)
     print(list(models.keys())[i])
     model_list.append(list(models.keys())[i])
     print("Model Training Performance")
     print("RMSE:", rmse)
     print("MAE:", mae)
     print("R2 score:", r2_square)
     print("MSE:", mse)

LinearRegression
Model Training Performance
RMSE: 50.09628337465734
MAE: 17.313547817993904
R2 score: 0.27988257190959076
MSE: 2509.6376079539696
Lasso
Model Training Performance
RMSE: 50.229325205982036
MAE: 17.26423797222632
R2 score: 0.2760526287810653
MSE: 2522.985110648302
Ridge
Model Training Performance
RMSE: 50.096651864430584
MAE: 17.31329044994682
R2 score: 0.279871978034523
MSE: 2509.6745280259565
ElasticNet
Model Training Performance
RMSE: 51.89596008933865
MAE: 18.494530931371724
R2 score: 0.22721370803536123
MSE: 2693.1906735942302


In [60]:
model_list

['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet']