In [1]:
# SLNP
import pandas as pd 

In [2]:
df = pd.DataFrame(pd.read_csv(r"D:\Packagepred\notebooks\data\Placements_Dataset.csv"))
df.head()

Unnamed: 0,Name of Student,Roll No.,No. of DSA questions,CGPA,Knows ML,Knows DSA,Knows Python,Knows JavaScript,Knows HTML,Knows CSS,Knows Cricket,Knows Dance,Participated in College Fest,Was in Coding Club,No. of backlogs,Interview Room Temperature,Age of Candidate,Branch of Engineering,Placement Package
0,Todd Pope,30678,151,8.52,Yes,Yes,Yes,Yes,No,Yes,No,No,Yes,Yes,2,24.2,24,Computer Science,20.01
1,Sandra Brown,49191,24,1.23,Yes,No,No,Yes,No,No,Yes,No,No,Yes,1,20.5,18,Computer Science,10.97
2,Mrs. Amanda Singleton,83519,333,9.85,No,Yes,Yes,No,Yes,No,No,No,No,No,1,21.6,25,Mechanical Engineering,7.51
3,Matthew Alvarado,56203,132,1.96,No,No,Yes,No,Yes,No,No,No,Yes,No,4,21.2,20,Computer Science,4.96
4,Christine Smith,82173,198,9.73,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,No,1,20.7,21,Electrical Engineering,46.3


In [4]:
df.drop(columns='Name of Student', axis = 1, inplace = True)
df.drop(columns='Roll No.', axis = 1, inplace = True)
df.drop(columns='Branch of Engineering', axis = 1, inplace = True)

In [5]:
df.drop(columns=['Interview Room Temperature','Age of Candidate','Knows Cricket','Knows Dance','Participated in College Fest','Knows HTML','Knows CSS'],inplace=True)

Segregate the independent and dependent features

In [6]:
X = df.drop(labels='Placement Package', axis = 1)

Y = df['Placement Package']
print(X) 
print(Y)

        No. of DSA questions  CGPA Knows ML Knows DSA Knows Python  \
0                        151  8.52      Yes       Yes          Yes   
1                         24  1.23      Yes        No           No   
2                        333  9.85       No       Yes          Yes   
3                        132  1.96       No        No          Yes   
4                        198  9.73      Yes       Yes          Yes   
...                      ...   ...      ...       ...          ...   
219995                   453  8.77       No       Yes           No   
219996                   375  6.15       No        No          Yes   
219997                    52  2.05       No        No           No   
219998                   404  5.63       No        No          Yes   
219999                   134  7.49      Yes       Yes           No   

       Knows JavaScript Was in Coding Club  No. of backlogs  
0                   Yes                Yes                2  
1                   Yes            

Define ordinal which columns should be ordinal encoded and which should be scaled

In [7]:
categorical_columns = X.select_dtypes(include = 'object').columns
numerical_columns = X.select_dtypes(exclude= 'object').columns

In [8]:
categorical_columns

Index(['Knows ML', 'Knows DSA', 'Knows Python', 'Knows JavaScript',
       'Was in Coding Club'],
      dtype='object')

In [9]:
numerical_columns

Index(['No. of DSA questions', 'CGPA', 'No. of backlogs'], dtype='object')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220000 entries, 0 to 219999
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   No. of DSA questions  220000 non-null  int64  
 1   CGPA                  220000 non-null  float64
 2   Knows ML              215581 non-null  object 
 3   Knows DSA             220000 non-null  object 
 4   Knows Python          215573 non-null  object 
 5   Knows JavaScript      215718 non-null  object 
 6   Was in Coding Club    220000 non-null  object 
 7   No. of backlogs       220000 non-null  int64  
 8   Placement Package     220000 non-null  float64
dtypes: float64(2), int64(2), object(5)
memory usage: 15.1+ MB


In [11]:
ML_categories = ['No','Yes']
DSA_categories = ['No','Yes']
Python_categories = ['No','Yes']
Javascript_categories = ['No','Yes']
Club_categories = ['No','Yes']

How to work with the missing values?

We need simple imputer to fill the missing values and standard scaler to do feature scaling of numerical values

In [12]:
from sklearn.impute import SimpleImputer # Missing values
from sklearn.preprocessing import StandardScaler # Feature scaling 
from sklearn.preprocessing import OrdinalEncoder # To rank categorical features
# Pipeline
from sklearn.pipeline import Pipeline #To add everything together 
from sklearn.compose import ColumnTransformer # Combine everything together

Creating Numerical Pipeline

In [13]:
num_pipe = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy= 'mean')), 
        ('scaler', StandardScaler())
    ]
)

cat_pipe = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')), 
        ('ordinalencoder', OrdinalEncoder(categories=[ML_categories,DSA_categories,Python_categories,Javascript_categories,Club_categories])), 
        ('scaler', StandardScaler())
    ]

)

Combine both the pipelines

In [14]:
preprocessor = ColumnTransformer([
    ('num_pipe', num_pipe, numerical_columns), 
    ('cat_pipe', cat_pipe, categorical_columns)
])

Train Test Split

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size= 0.25, random_state= 41)

In [16]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out()) 
X_test = pd.DataFrame(preprocessor.transform(X_test), columns = preprocessor.get_feature_names_out())

In [17]:
X_train.head()

Unnamed: 0,num_pipe__No. of DSA questions,num_pipe__CGPA,num_pipe__No. of backlogs,cat_pipe__Knows ML,cat_pipe__Knows DSA,cat_pipe__Knows Python,cat_pipe__Knows JavaScript,cat_pipe__Was in Coding Club
0,0.684595,1.368821,1.467938,-0.977466,0.999588,-1.020575,1.020612,1.002221
1,-0.145749,-1.616734,0.881952,-0.977466,-1.000412,-1.020575,1.020612,1.002221
2,1.113606,0.464842,-0.29002,-0.977466,-1.000412,0.97984,-0.979804,1.002221
3,-0.699312,1.413847,0.881952,1.023054,0.999588,-1.020575,1.020612,-0.997784
4,-1.695724,0.700362,1.467938,1.023054,-1.000412,0.97984,1.020612,-0.997784


In [18]:
X_test.head()

Unnamed: 0,num_pipe__No. of DSA questions,num_pipe__CGPA,num_pipe__No. of backlogs,cat_pipe__Knows ML,cat_pipe__Knows DSA,cat_pipe__Knows Python,cat_pipe__Knows JavaScript,cat_pipe__Was in Coding Club
0,-0.111151,-0.809733,0.295966,-0.977466,-1.000412,0.97984,-0.979804,1.002221
1,0.8991,-0.369866,1.467938,1.023054,0.999588,0.97984,1.020612,-0.997784
2,-0.381013,-1.066034,1.467938,1.023054,-1.000412,-1.020575,1.020612,-0.997784
3,0.802227,-0.082394,0.295966,1.023054,-1.000412,0.97984,-0.979804,1.002221
4,1.687927,-0.754317,-0.876006,1.023054,0.999588,-1.020575,1.020612,-0.997784


In [19]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor

from math import sqrt
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error 
#cann't import  mean_squared_log_error because it requires scipy and numpy which are not available in the colab environment

In [20]:
regression = RandomForestRegressor()
regression.fit(X_train, y_train)

KeyboardInterrupt: 

: 

: 

In [None]:
import numpy as np
def model_evaluation(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

: 

Training multiple models 


In [None]:
models={
    'LinearRegression':LinearRegression(),
    'RandomForest' : RandomForestRegressor(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=model_evaluation(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('*'*35)
    print('\n')

: 

In [None]:
model_list

: 

In [21]:
df.to_csv("filter.csv", index=False)