In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Train a random-forest regressor that predicts the votes polled
# (`totvotpoll`) from the election CSV and report hold-out MSE and R-squared.
df = pd.read_csv("indian-national-level-election.csv")

# Fix the 70/30 row partition first so that every statistic learned below
# (imputation value, scaling range, feature scores) comes from training rows
# only and nothing from the hold-out rows leaks into preprocessing.
train_index, test_index = train_test_split(
    df.index.to_numpy(), test_size=0.3, random_state=42
)

# Impute missing genders BEFORE label-encoding: once the column is encoded it
# holds integer codes with no NaN left, so a later fillna could never fire.
most_frequent_train_gender = df.loc[train_index, 'cand_sex'].mode()[0]
df['cand_sex'] = df['cand_sex'].fillna(most_frequent_train_gender)

# One encoder per categorical column, so each label <-> code mapping stays
# available (e.g. for inverse_transform) instead of being overwritten.
label_encoders = {}
for column in ('st_name', 'partyabbre', 'cand_sex'):
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

# Kept for code that expects the single-encoder name; it refers to the
# encoder fitted on `cand_sex`.
label_encoder = label_encoders['cand_sex']

attributes_to_scale = ['electors']

min_max_scaler = MinMaxScaler()
data_normalized = df.copy()  # snapshot of the frame before scaling
# Learn the min/max from training rows only, then apply it to every row;
# hold-out values may therefore fall slightly outside [0, 1].
min_max_scaler.fit(data_normalized.loc[train_index, attributes_to_scale])
df[attributes_to_scale] = min_max_scaler.transform(data_normalized[attributes_to_scale])

df = df.drop(columns=['pc_name', 'pc_type', 'cand_name', 'partyname'])

x = df.drop(columns=["totvotpoll"])
y = df['totvotpoll']

# Score features on the training rows only. `k` is capped at the number of
# available predictors because SelectKBest rejects k > n_features.
selector = SelectKBest(score_func=f_regression, k=min(6, x.shape[1]))
selector.fit(x.loc[train_index], y.loc[train_index])
selected_features = x.columns[selector.get_support()]

X = df[selected_features]

X_train, X_test = X.loc[train_index], X.loc[test_index]
y_train, y_test = y.loc[train_index], y.loc[test_index]

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

print("Mean Squared Error  : ", round(mse, 2))
print("R-squared Value     : ", r_squared)
# NOTE: this is R-squared expressed as a percentage (share of variance
# explained), not a classification accuracy.
print("Accuracy Percentage : ", round(100 * r_squared, 2), "%")


Mean Squared Error  :  1504170913.24
R-squared Value     :  0.8628755444779936
Accuracy Percentage :  86.29 %
