### Load Data

In [256]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 

In [257]:
df=pd.read_csv("./dataset/engineering.csv")
df.sample(6)

Unnamed: 0,institute_id,name,tlr,rpc,go,oi,perception,city,state,rank
77,IR-E-U-0201,Shri Mata Vaishno Devi University,69.41,18.63,41.79,48.0,1.1,Katra,Jammu and Kashmir,78
81,IR-E-C-36926,Kumaraguru College of Technology,63.11,11.35,54.41,45.37,14.79,Coimbatore,Tamil Nadu,82
123,IR-E-C-25622,Chaitanya Bharathi Institute of Technology,57.82,2.01,57.52,50.22,8.5,Hyderabad,Telangana,124
96,IR-E-C-18154,University College of Engineering,58.25,10.41,59.54,50.97,1.63,Kakinada,Andhra Pradesh,97
125,IR-E-C-1413,Dayananda Sagar College of Engineering,51.56,6.69,57.99,54.63,5.69,Bengaluru,Karnataka,126
100,IR-E-C-27058,Mepco Schlenk Engineering College,49.72,18.31,52.51,47.18,18.86,Sivakasi,Tamil Nadu,101


In [258]:
df.shape

(200, 10)

In [259]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   institute_id  200 non-null    object 
 1   name          200 non-null    object 
 2   tlr           200 non-null    float64
 3   rpc           200 non-null    float64
 4   go            200 non-null    float64
 5   oi            200 non-null    float64
 6   perception    200 non-null    float64
 7   city          200 non-null    object 
 8   state         200 non-null    object 
 9   rank          200 non-null    int64  
dtypes: float64(5), int64(1), object(4)
memory usage: 15.8+ KB


In [260]:
df.isnull().sum()

institute_id    0
name            0
tlr             0
rpc             0
go              0
oi              0
perception      0
city            0
state           0
rank            0
dtype: int64

In [261]:
df.describe()

Unnamed: 0,tlr,rpc,go,oi,perception,rank
count,200.0,200.0,200.0,200.0,200.0,200.0
mean,59.05015,20.30735,56.7036,51.8737,14.21465,100.475
std,10.152778,19.937026,11.10281,6.853437,19.262625,57.866936
min,35.51,0.46,13.06,33.8,0.0,1.0
25%,52.535,5.3975,50.1125,47.36,2.5575,50.75
50%,57.525,13.35,55.07,51.855,6.65,100.5
75%,64.3925,30.6425,63.095,56.0025,17.79,150.25
max,95.42,96.15,89.65,75.7,100.0,200.0


In [262]:
df.duplicated().sum()

0

In [263]:
clean_df=df.drop(["institute_id","name","city","state"],axis=True)
clean_df.sample(6)

Unnamed: 0,tlr,rpc,go,oi,perception,rank
34,71.41,41.69,59.89,54.4,8.95,35
165,59.15,2.31,46.82,42.34,6.65,166
48,69.35,19.73,52.14,48.69,51.77,49
125,51.56,6.69,57.99,54.63,5.69,126
183,54.06,1.24,50.93,47.58,0.0,184
20,69.57,47.62,63.17,47.92,45.56,21


### Split Data

In [264]:
X = clean_df.drop('rank', axis=1)
y = clean_df['rank']

print('Shape of X = ', X.shape)
print('Shape of y = ', y.shape)

Shape of X =  (200, 5)
Shape of y =  (200,)


In [265]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)

print('Shape of X_train = ', X_train.shape)
print('Shape of y_train = ', y_train.shape)
print('Shape of X_test = ', X_test.shape)
print('Shape of y_test = ', y_test.shape)

Shape of X_train =  (160, 5)
Shape of y_train =  (160,)
Shape of X_test =  (40, 5)
Shape of y_test =  (40,)


### Model Traning with Support Vector Regressor

In [266]:
from sklearn.svm import SVR

In [267]:
#Model Traning with ernel='linear'
svr_linear = SVR(kernel='linear')
svr_linear.fit(X_train, y_train)
svr_linear.score(X_test, y_test)

0.8024276602878435

In [268]:
y_pred=svr_linear.predict(X_test)

In [269]:
from sklearn.metrics import mean_squared_error

In [270]:
#first finding mean square error using mean_squared_error class
mse = mean_squared_error(y_test, y_pred)
#finding Root mean square error using pandas
rmse = np.sqrt(mse)

print('MSE = ', mse)
print('RMSE = ', rmse)

MSE =  691.1691682557217
RMSE =  26.2900963911455


### Model Traning with Random Forest Regressor

In [271]:
from sklearn.ensemble import RandomForestRegressor

In [272]:
regressorRFR = RandomForestRegressor(n_estimators=100, criterion='squared_error')
regressorRFR.fit(X_train, y_train)

In [273]:
regressorRFR.score(X_test, y_test)

0.933668287270905

In [274]:
y_pred2=regressorRFR.predict(X_test)

In [275]:
#first finding mean square error using mean_squared_error class
mse = mean_squared_error(y_test, y_pred2)
#finding Root mean square error using pandas
rmse = np.sqrt(mse)

print('MSE = ', mse)
print('RMSE = ', rmse)

MSE =  232.0488525
RMSE =  15.233149789193304


In [276]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(regressorRFR, X_train, y_train, cv=5, ).mean()

0.9191807328831466

In [277]:
int(regressorRFR.predict([X_test.iloc[18, :]])[0].round())



118

In [278]:
y_test.iloc[18]

129

### Save the Model

In [279]:
import joblib
joblib.dump(regressorRFR, "college_rank_predictor.pkl")

['college_rank_predictor.pkl']

In [280]:
model = joblib.load("college_rank_predictor.pkl")

In [281]:
model.predict([X_test.iloc[18, :]])[0]



117.84

In [282]:
feature_importances = model.feature_importances_
feature_names = X.columns

# Combine feature names with their importance scores
feature_importance = dict(zip(feature_names, feature_importances))

# Sort features based on importance
sorted_feature_importance = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)

# Print the sorted feature importance
for feature, importance in sorted_feature_importance:
    print(f'{feature}: {importance}')


rpc: 0.5670756505633826
perception: 0.17501247980687615
tlr: 0.15071849288255307
go: 0.08039300703143659
oi: 0.026800369715751667


In [283]:
# Assuming df and df_new are your DataFrames for different years
df_2016=pd.read_csv("./db/2016/EngineeringRanking_2016.csv")
df_2017=pd.read_csv("./db/2017/EngineeringRanking_2017.csv")
df_2018=pd.read_csv("./db/2018/EngineeringRanking_2018.csv")
df_2019=pd.read_csv("./db/2019/EngineeringRanking_2019.csv")
df_2020=pd.read_csv("./db/2020/EngineeringRanking_2020.csv")
df_2021=pd.read_csv("./db/2021/EngineeringRanking_2021.csv")


df_2016['year'] = 1
df_2016['year'] = 2
df_2017['year'] = 3
df_2018['year'] = 4
df_2019['year'] = 5
df_2020['year'] = 6
df_2021['year'] = 7



df_combined = pd.concat([df_2016,df_2017,df_2018,df_2019,df_2020,df_2021], ignore_index=True)
excel_file_path_combined = 'combined_data.xlsx'

# Save the combined DataFrame to Excel
# df_combined.to_excel(excel_file_path_combined, index=False)

# Save the combined DataFrame to CSV
csv_file_path_combined = 'combined_data.csv'
df_combined.to_csv(csv_file_path_combined, index=False)

print(f"Combined DataFrame has been saved to {csv_file_path_combined}")

print(df_combined.columns)

clean_df=df_combined.drop(["Institute Id","Institute Name","City","State"],axis=True)
clean_df.sample(6)

Combined DataFrame has been saved to combined_data.csv
Index(['Institute Id', 'Institute Name', 'City', 'State', 'Score', 'Rank',
       'TLR', 'RPC', 'GO', 'OI', 'Perception', 'year'],
      dtype='object')


Unnamed: 0,Score,Rank,TLR,RPC,GO,OI,Perception,year
500,89.93,1,95.42,94.64,83.9,61.31,100.0,6
545,48.66,46,61.63,35.41,62.3,51.68,19.21,6
447,32.31,148,55.99,4.59,43.66,51.55,2.41,5
217,56.89,18,81.45,39.77,64.08,59.02,17.99,4
34,63.39,33,40.32,72.19,95.08,66.33,61.0,2
452,32.2,153,50.09,4.24,55.5,42.62,5.44,5


In [284]:



X = clean_df.drop('Rank', axis=1)
y = clean_df['Rank']

print('Shape of X = ', X.shape)
print('Shape of y = ', y.shape)


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)

print('Shape of X_train = ', X_train.shape)
print('Shape of y_train = ', y_train.shape)
print('Shape of X_test = ', X_test.shape)
print('Shape of y_test = ', y_test.shape)


model_combined = RandomForestRegressor(n_estimators=100, random_state=42)
model_combined.fit(X_train, y_train)
# Assuming X_test and y_test are your test data
print(X_train)


Shape of X =  (900, 7)
Shape of y =  (900,)
Shape of X_train =  (720, 7)
Shape of y_train =  (720,)
Shape of X_test =  (180, 7)
Shape of y_test =  (180,)


     Score    TLR    RPC     GO     OI  Perception  year
152  42.73  61.17  18.43  59.99  58.30       10.17     3
411  35.02  61.36   2.75  54.70  48.52        0.00     5
364  40.51  54.81  14.26  61.96  55.76       18.19     5
327  52.69  63.47  28.22  77.01  61.09       36.71     5
597  37.73  45.32  22.62  60.94  45.84        5.69     6
..     ...    ...    ...    ...    ...         ...   ...
528  54.45  65.95  47.65  65.69  54.82       17.42     6
709  64.19  73.15  50.17  78.62  64.54       50.18     7
736  53.95  70.69  43.21  62.04  57.79       15.86     7
485  30.79  40.58   1.26  61.61  57.60        1.63     5
57   57.47  73.56  51.98  75.30  38.57       33.00     2

[720 rows x 7 columns]


In [285]:
predictions_combined = model_combined.predict(X_test)

from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

mae_combined = mean_absolute_error(y_test, predictions_combined)
mse_combined = mean_squared_error(y_test, predictions_combined)
r2_combined = r2_score(y_test, predictions_combined)

print(f'Combined Data Mean Absolute Error: {mae_combined}')
print(f'Combined Data Mean Squared Error: {mse_combined}')
print(f'Combined Data R-squared: {r2_combined}')

Combined Data Mean Absolute Error: 1.6833333333333327
Combined Data Mean Squared Error: 6.413225555555555
Combined Data R-squared: 0.997892377440574


In [286]:
import joblib
joblib.dump(regressorRFR, "college_rank_predictor1.pkl")
model = joblib.load("college_rank_predictor.pkl1")


FileNotFoundError: [Errno 2] No such file or directory: 'college_rank_predictor.pkl1'