### Load Data

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [7]:
df=pd.read_csv("./dataset/engineering.csv")
df.sample(6)

Unnamed: 0,institute_id,name,tlr,rpc,go,oi,perception,city,state,rank
45,IR-E-U-0055,National Institute of Technology Silchar,61.63,35.41,62.3,51.68,19.21,Silchar,Assam,46
172,IR-E-C-42242,Maharshi Karve Stree Shikshan Samstha s Cummin...,50.8,2.95,52.51,53.89,2.16,Pune,Maharashtra,173
13,IR-E-U-0439,Anna University,64.62,54.07,61.5,51.62,68.24,Chennai,Tamil Nadu,14
138,IR-E-C-18817,G H Raisoni College of Engineering,46.65,9.68,61.0,49.25,0.55,Nagpur,Maharashtra,139
28,IR-E-I-1480,Thapar Institute of Engineering Technology,65.95,47.65,65.69,54.82,17.42,Patiala,Punjab,29
162,IR-E-C-6238,Haldia Institute of Technology,49.15,6.8,50.2,55.8,4.22,Haldia,West Bengal,163


In [8]:
df.shape

(200, 10)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   institute_id  200 non-null    object 
 1   name          200 non-null    object 
 2   tlr           200 non-null    float64
 3   rpc           200 non-null    float64
 4   go            200 non-null    float64
 5   oi            200 non-null    float64
 6   perception    200 non-null    float64
 7   city          200 non-null    object 
 8   state         200 non-null    object 
 9   rank          200 non-null    int64  
dtypes: float64(5), int64(1), object(4)
memory usage: 15.8+ KB


In [10]:
df.isnull().sum()

institute_id    0
name            0
tlr             0
rpc             0
go              0
oi              0
perception      0
city            0
state           0
rank            0
dtype: int64

In [11]:
df.describe()

Unnamed: 0,tlr,rpc,go,oi,perception,rank
count,200.0,200.0,200.0,200.0,200.0,200.0
mean,59.05015,20.30735,56.7036,51.8737,14.21465,100.475
std,10.152778,19.937026,11.10281,6.853437,19.262625,57.866936
min,35.51,0.46,13.06,33.8,0.0,1.0
25%,52.535,5.3975,50.1125,47.36,2.5575,50.75
50%,57.525,13.35,55.07,51.855,6.65,100.5
75%,64.3925,30.6425,63.095,56.0025,17.79,150.25
max,95.42,96.15,89.65,75.7,100.0,200.0


In [12]:
df.duplicated().sum()

0

In [13]:
df.corr()

ValueError: could not convert string to float: 'IR-E-U-0456'

In [None]:
df.corr()['rank']

ValueError: could not convert string to float: 'IR-E-U-0456'

In [14]:
clean_df=df.drop(["institute_id","name","city","state"],axis=True)
clean_df.sample(6)

Unnamed: 0,tlr,rpc,go,oi,perception,rank
49,61.94,21.89,67.56,51.85,29.4,50
35,63.36,31.12,73.21,53.07,36.91,36
190,43.74,7.8,51.73,52.96,2.16,191
40,61.27,41.96,63.57,61.06,11.55,41
77,69.41,18.63,41.79,48.0,1.1,78
140,57.68,1.21,54.84,51.32,1.1,141


### Split Data

In [15]:
X = clean_df.drop('rank', axis=1)
y = clean_df['rank']

print('Shape of X = ', X.shape)
print('Shape of y = ', y.shape)

Shape of X =  (200, 5)
Shape of y =  (200,)


In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)

print('Shape of X_train = ', X_train.shape)
print('Shape of y_train = ', y_train.shape)
print('Shape of X_test = ', X_test.shape)
print('Shape of y_test = ', y_test.shape)

Shape of X_train =  (160, 5)
Shape of y_train =  (160,)
Shape of X_test =  (40, 5)
Shape of y_test =  (40,)


### Model Traning with Support Vector Regressor

In [17]:
from sklearn.svm import SVR

In [18]:
#Model Traning with ernel='linear'
svr_linear = SVR(kernel='linear')
svr_linear.fit(X_train, y_train)
svr_linear.score(X_test, y_test)

0.8024276602876007

In [19]:
y_pred=svr_linear.predict(X_test)

In [20]:
from sklearn.metrics import mean_squared_error

In [21]:
#first finding mean square error using mean_squared_error class
mse = mean_squared_error(y_test, y_pred)
#finding Root mean square error using pandas
rmse = np.sqrt(mse)

print('MSE = ', mse)
print('RMSE = ', rmse)

MSE =  691.1691682565713
RMSE =  26.290096391161658


### Model Traning with Random Forest Regressor

In [22]:
from sklearn.ensemble import RandomForestRegressor

In [23]:
regressorRFR = RandomForestRegressor(n_estimators=100, criterion='squared_error')
regressorRFR.fit(X_train, y_train)

In [24]:
regressorRFR.score(X_test, y_test)

0.9343179603719296

In [25]:
y_pred2=regressorRFR.predict(X_test)

In [26]:
#first finding mean square error using mean_squared_error class
mse = mean_squared_error(y_test, y_pred2)
#finding Root mean square error using pandas
rmse = np.sqrt(mse)

print('MSE = ', mse)
print('RMSE = ', rmse)

MSE =  229.776095
RMSE =  15.15836716140627


In [27]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(regressorRFR, X_train, y_train, cv=5, ).mean()

0.9236243282380527

In [28]:
int(regressorRFR.predict([X_test.iloc[18, :]])[0].round())



115

In [29]:
y_test.iloc[18]

129

### Save the Model

In [30]:
import joblib
#joblib.dump(regressorRFR, "college_rank_predictor.pkl")

In [31]:
model = joblib.load("college_rank_predictor.pkl")

In [32]:
model.predict([X_test.iloc[18, :]])[0]



120.02

In [33]:
feature_importances = model.feature_importances_
feature_names = X.columns

# Combine feature names with their importance scores
feature_importance = dict(zip(feature_names, feature_importances))

# Sort features based on importance
sorted_feature_importance = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)

# Print the sorted feature importance
for feature, importance in sorted_feature_importance:
    print(f'{feature}: {importance}')


rpc: 0.5752502448349439
tlr: 0.16644566892482776
perception: 0.15014586558431745
go: 0.0864081935269482
oi: 0.02175002712896271


In [None]:
# Assuming df and df_new are your DataFrames for different years
df['year'] = 1
df_new['year'] = 2
df_combined = pd.concat([df, df_new], ignore_index=True)
