# LOAD AND INSPECT DATA

In [1]:
import pandas as pd

# Load the dataset
file_path = 'nirf_data_sets.csv'  # Ensure the path is correct
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())

# Display the structure of the dataset
print("\nStructure of the dataset:")
print(data.info())

# Display basic statistics of the dataset
print("\nBasic statistics of the dataset:")
print(data.describe(include='all'))


First few rows of the dataset:
   ranking_year ranking_category  institute_id  \
0          2023           Dental   IR-N-I-1441   
1          2023           Dental   IR-N-C-7254   
2          2023           Dental   IR-N-I-1110   
3          2023           Dental  IR-N-C-28507   
4          2023           Dental  IR-N-C-19320   

                                      institute_name       city        state  \
0  Saveetha Institute of Medical and Technical Sc...    Chennai   Tamil Nadu   
1        Manipal College of Dental Sciences, Manipal    Manipal    Karnataka   
2                         Dr. D. Y. Patil Vidyapeeth       Pune  Maharashtra   
3          Maulana Azad Institute of Dental Sciences      Delhi        Delhi   
4   A.B.Shetty Memorial Institute of Dental Sciences  Mangaluru    Karnataka   

   score  rank  
0  84.08     1  
1  77.51     2  
2  73.08     3  
3  70.96     4  
4  69.21     5  

Structure of the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5413 ent

# CHECKING THE MISSING VALUES 

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Load the dataset
file_path = 'nirf_data_sets.csv'
data = pd.read_csv(file_path)

# Check for missing values
missing_values = data.isnull().sum()
print("Missing values before handling:")
print(missing_values)

# Handle missing values
data = data.dropna(subset=['score'])

# Check for missing values after handling
missing_values_after = data.isnull().sum()
print("\nMissing values after handling:")
print(missing_values_after)


Missing values before handling:
ranking_year         0
ranking_category     0
institute_id         0
institute_name       0
city                 0
state                0
score               10
rank                 0
dtype: int64

Missing values after handling:
ranking_year        0
ranking_category    0
institute_id        0
institute_name      0
city                0
state               0
score               0
rank                0
dtype: int64


# Descriptive Statistics


# HANDLING MISSING VALUES 

In [3]:
data_cleaned = data.dropna(subset=['score'])


In [4]:
# Handle missing values by dropping rows with missing score values
data = data.dropna(subset=['score'])


In [5]:
# Get descriptive statistics of the dataset, including both numerical and categorical features
desc_stats = data.describe(include='all')
print(desc_stats)


        ranking_year ranking_category institute_id             institute_name  \
count    5403.000000             5403         5403                       5403   
unique           NaN               12         2336                       1440   
top              NaN      Engineering  IR-O-U-0196  Aligarh Muslim University   
freq             NaN             1200           10                         48   
mean     2020.123820              NaN          NaN                        NaN   
std         2.113065              NaN          NaN                        NaN   
min      2016.000000              NaN          NaN                        NaN   
25%      2019.000000              NaN          NaN                        NaN   
50%      2020.000000              NaN          NaN                        NaN   
75%      2022.000000              NaN          NaN                        NaN   
max      2023.000000              NaN          NaN                        NaN   

           city       state

# Convert Categorical Variables

In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import pickle

# # Load the dataset
# file_path = 'nirf_data_sets.csv'
# data = pd.read_csv(file_path)

# Initialize a dictionary to store label encoders for each column
label_encoders = {}

# Categorical columns to encode, including institute_id
categorical_columns = ['ranking_category', 'institute_name', 'city', 'state',  'institute_id']

# Encode categorical variables
for column in categorical_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Save the label encoders for future use
with open('label_encoders.pkl', 'wb') as le_file:
    pickle.dump(label_encoders, le_file)

print("Label encoders saved successfully.")

# Verify the encoded data
print(data.head())


Label encoders saved successfully.
   ranking_year  ranking_category  institute_id  institute_name  city  state  \
0          2023                 3          1211            1138    70     27   
1          2023                 3          1208             804   255     14   
2          2023                 3          1210             275   319     17   
3          2023                 3          1186             826    83      7   
4          2023                 3          1184               1   254     14   

   score  rank  
0  84.08     1  
1  77.51     2  
2  73.08     3  
3  70.96     4  
4  69.21     5  


# DATA VISUALIZATION

In [7]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Load the dataset
file_path = 'nirf_data_sets.csv'
data = pd.read_csv(file_path)

# Distribution of Ranks
plt.figure(figsize=(10, 6))
sns.histplot(data['rank'], bins=30, kde=True)
plt.title('Distribution of Ranks')
plt.xlabel('Rank')
plt.ylabel('Frequency')
plt.savefig('static/images/rank_distribution.png')
plt.close()

# Distribution of Scores
plt.figure(figsize=(10, 6))
sns.histplot(data['score'], bins=30, kde=True)
plt.title('Distribution of Scores')
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.savefig('static/images/score_distribution.png')
plt.close()

# Select only numeric columns for the correlation heatmap
numeric_data = data.select_dtypes(include=[float, int])

# Correlation Heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
plt.title('Correlation Heatmap')
plt.savefig('static/images/correlation_heatmap.png')
plt.close()

# Bar chart of average scores by ranking category
plt.figure(figsize=(12, 6))
average_scores_by_category = data.groupby('ranking_category')['score'].mean().sort_values()
sns.barplot(x=average_scores_by_category.index, y=average_scores_by_category.values)
plt.title('Average Scores by Ranking Category')
plt.xlabel('Ranking Category')
plt.ylabel('Average Score')
plt.xticks(rotation=45)
plt.savefig('static/images/average_scores_by_category.png')
plt.close()

# Pie chart of institutes by state
plt.figure(figsize=(12, 8))
institutes_by_state = data['state'].value_counts()
plt.pie(institutes_by_state, labels=institutes_by_state.index, autopct='%1.1f%%', startangle=140)
plt.title('Distribution of Institutes by State')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.savefig('static/images/institutes_by_state.png')
plt.close()


# Data Preparation for Modeling

In [8]:
print(data['score'].isnull().sum())


10


# FETURE SCALING

In [9]:
# Define the features and target variable
features = data.drop(columns=['score'])  # Drop 'score' from features, keep 'rank' as a feature
target = data['score']  # Set 'score' as the target variable


In [10]:
data = data.dropna(subset=['score'])


In [11]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pickle
import joblib

# Load the dataset
# file_path = 'nirf_data_sets.csv'
# data = pd.read_csv(file_path)

# Initialize a dictionary to store label encoders for each column
label_encoders = {}

# Categorical columns to encode, including institute_id
categorical_columns = ['ranking_category', 'institute_name', 'city', 'state', 'institute_id']

# Encode categorical variables
for column in categorical_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Save the label encoders for future use
with open('label_encoders.pkl', 'wb') as le_file:
    pickle.dump(label_encoders, le_file)
print("Label encoders saved successfully.")

# Select numerical features (adjust as per your dataset)
numerical_features = ['institute_id', 'city', 'state', 'ranking_category','rank']

# Initialize the scaler
scaler = StandardScaler()

# Standardize numerical features
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# Display the first few rows to see the changes
print(data.head())

# Save the scaler for future use
joblib.dump(scaler, 'scaler.pkl')
print("Scaler saved successfully.")


Label encoders saved successfully.
   ranking_year  ranking_category  institute_id  institute_name      city  \
0          2023         -1.039642      0.195979            1138 -1.089130   
1          2023         -1.039642      0.190208             804  0.387806   
2          2023         -1.039642      0.194055             275  0.898747   
3          2023         -1.039642      0.147891             826 -0.985345   
4          2023         -1.039642      0.144044               1  0.379823   

      state  score      rank  
0  0.910736  84.08 -1.237745  
1 -0.519862  77.51 -1.212766  
2 -0.189724  73.08 -1.187788  
3 -1.290184  70.96 -1.162809  
4 -0.519862  69.21 -1.137831  
Scaler saved successfully.


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset (assuming 'data' has already been label-encoded and standardized)
file_path = 'nirf_data_sets.csv'
data = pd.read_csv(file_path)

# Remove rows with missing target values
data = data.dropna(subset=['score'])  # Drop rows where 'score' is NaN

# Define the features and target variable
features = data.drop(columns=['score'])  # Drop only 'score' from features, keep 'rank' as a feature if needed
target = data['score']  # Set 'score' as the target variable

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Display the shapes of the splits to confirm
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')


X_train shape: (4322, 7)
X_test shape: (1081, 7)
y_train shape: (4322,)
y_test shape: (1081,)


# Model Building, Training, and Testing

In [13]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import pickle

# Load the dataset
# file_path = 'nirf_data_sets.csv'
# data = pd.read_csv(file_path)

# Remove rows with missing target values
data = data.dropna(subset=['score'])

# Initialize a dictionary to store label encoders for each column
label_encoders = {}

# Categorical columns to encode
categorical_columns = ['ranking_category', 'institute_id', 'institute_name', 'city', 'state']

# Encode categorical variables
for column in categorical_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Save the label encoders for future use
with open('label_encoders.pkl', 'wb') as le_file:
    pickle.dump(label_encoders, le_file)

# Select numerical features to standardize
numerical_features = ['institute_id', 'city', 'state', 'ranking_category', 'rank']

# Initialize the scaler
scaler = StandardScaler()

# Standardize numerical features
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# Save the scaler for future use
joblib.dump(scaler, 'scaler.pkl')
print("Scaler saved successfully.")

# Define the features and target variable
features = data[numerical_features]
target = data['score']  # Set 'score' as the target variable

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model without rounding predictions
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

# Save the trained model for future use
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
print("Model saved successfully.")


Scaler saved successfully.
Mean Squared Error: 5.470610704525968
R^2 Score: 0.9596662295070839
Model saved successfully.


In [14]:
from sklearn.linear_model import LinearRegression

# Initialize the model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Linear Regression - Mean Squared Error: {mse}')
print(f'Linear Regression - R^2 Score: {r2}')


Linear Regression - Mean Squared Error: 49.45520285000096
Linear Regression - R^2 Score: 0.6353762113282391


In [15]:
from sklearn.tree import DecisionTreeRegressor

# Initialize the model
model = DecisionTreeRegressor(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Decision Tree Regression - Mean Squared Error: {mse}')
print(f'Decision Tree Regression - R^2 Score: {r2}')


Decision Tree Regression - Mean Squared Error: 8.335681468316375
Decision Tree Regression - R^2 Score: 0.9385426086036478


In [16]:
# SUPORT VECTOR METHOD

from sklearn.svm import SVR 

# Initialize the model
model = SVR(kernel='linear')

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Support Vector Regression - Mean Squared Error: {mse}')
print(f'Support Vector Regression - R^2 Score: {r2}')


Support Vector Regression - Mean Squared Error: 52.06418223535649
Support Vector Regression - R^2 Score: 0.6161407033688406


In [17]:
from sklearn.ensemble import GradientBoostingRegressor

# Initialize the model
model = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Gradient Boosting Regression - Mean Squared Error: {mse}')
print(f'Gradient Boosting Regression - R^2 Score: {r2}')


Gradient Boosting Regression - Mean Squared Error: 5.664113558178169
Gradient Boosting Regression - R^2 Score: 0.9582395698322371


In [18]:
from sklearn.neighbors import KNeighborsRegressor

# Initialize the model
model = KNeighborsRegressor(n_neighbors=5)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'K-Nearest Neighbors Regression - Mean Squared Error: {mse}')
print(f'K-Nearest Neighbors Regression - R^2 Score: {r2}')


K-Nearest Neighbors Regression - Mean Squared Error: 18.199037404329324
K-Nearest Neighbors Regression - R^2 Score: 0.8658219644013547


In [19]:
from sklearn.linear_model import Ridge

# Initialize the model
model = Ridge(alpha=1.0)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Ridge Regression - Mean Squared Error: {mse}')
print(f'Ridge Regression - R^2 Score: {r2}')


Ridge Regression - Mean Squared Error: 49.45799376809207
Ridge Regression - R^2 Score: 0.6353556344208642


In [20]:
from sklearn.linear_model import Lasso

# Initialize the model
model = Lasso(alpha=0.1)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Lasso Regression - Mean Squared Error: {mse}')
print(f'Lasso Regression - R^2 Score: {r2}')


Lasso Regression - Mean Squared Error: 49.750785126982976
Lasso Regression - R^2 Score: 0.6331969395128084
