In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the labelled training set and split it into features (X) and an
# integer-encoded target (y). The fitted `label_encoder` is kept so that
# predicted class codes can be mapped back to hobby names later.
target_col = 'Predicted Hobby'

data = pd.read_csv(r'training_data.csv')
label_encoder = LabelEncoder().fit(data[target_col])
data[target_col] = label_encoder.transform(data[target_col])

y = data[target_col]
X = data.drop(columns=[target_col])
X.head()

Unnamed: 0,Olympiad_Participation,Scholarship,School,Fav_sub,Projects,Grasp_pow,Time_sprt,Medals,Career_sprt,Act_sprt,Fant_arts,Won_arts,Time_art
0,Yes,No,Yes,Mathematics,No,3,4,No,No,No,No,Maybe,1
1,No,No,Yes,Mathematics,No,4,3,No,No,Yes,Yes,No,3
2,Yes,No,Yes,Science,Yes,3,6,Yes,No,Yes,Yes,Yes,3
3,Yes,Yes,Yes,Mathematics,No,3,3,No,No,No,No,No,3
4,No,No,No,Any language,No,3,3,No,No,Yes,Yes,Yes,5


In [3]:
# Summary statistics for every feature column; include='all' reports the
# categorical columns (unique/top/freq) alongside the numeric ones.
X.describe(include='all')

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Olympiad_Participation,Scholarship,School,Fav_sub,Projects,Grasp_pow,Time_sprt,Medals,Career_sprt,Act_sprt,Fant_arts,Won_arts,Time_art
count,1451,1451,1451,1451,1451,1451.0,1451.0,1451,1451,1451,1451,1451,1451.0
unique,2,2,2,4,2,,,2,2,2,2,3,
top,Yes,No,Yes,Mathematics,Yes,,,Yes,No,Yes,No,No,
freq,733,859,1054,632,847,,,768,904,818,822,745,
mean,,,,,,3.497588,3.048243,,,,,,2.251551
std,,,,,,0.994031,1.360685,,,,,,1.269044
min,,,,,,1.0,1.0,,,,,,1.0
25%,,,,,,3.0,2.0,,,,,,1.0
50%,,,,,,3.0,3.0,,,,,,2.0
75%,,,,,,4.0,4.0,,,,,,3.0


In [4]:
# Print the number of distinct values in each categorical feature.
categorical_levels = X.select_dtypes(include=['category', 'object']).nunique()
for col, n_levels in categorical_levels.items():
    print(col, ":", n_levels)

Olympiad_Participation : 2
Scholarship : 2
School : 2
Fav_sub : 4
Projects : 2
Medals : 2
Career_sprt : 2
Act_sprt : 2
Fant_arts : 2
Won_arts : 3


In [5]:
# Split the feature columns by how they will be preprocessed:
#   numCol -> numeric columns
#   oneCol -> categorical columns (binary ones first, then multi-level ones)
numCol = list(X.select_dtypes(include=['number']).columns)

catCol = list(X.select_dtypes(include=['category', 'object']).columns)
catLevels = X[catCol].nunique()

# Multi-level columns are derived from the data rather than hard-coded by
# name, so a new categorical column is not silently left out of the model.
# Columns with a single level carry no information and stay excluded.
binaryCol = [col for col in catCol if catLevels[col] == 2]
multiCol = [col for col in catCol if catLevels[col] > 2]
oneCol = binaryCol + multiCol

print(oneCol)
print(numCol)

['Olympiad_Participation', 'Scholarship', 'School', 'Projects', 'Medals', 'Career_sprt', 'Act_sprt', 'Fant_arts', 'Fav_sub', 'Won_arts']
['Grasp_pow', 'Time_sprt', 'Time_art']


In [6]:
# Column-wise preprocessing shared by every candidate model:
#   - numeric columns are standardized
#   - categorical columns are one-hot encoded
# handle_unknown='ignore' encodes a category that was not present when the
# encoder was fit as all zeros instead of raising, so a rare level that is
# missing from a CV training fold (or only appears in the test file) does not
# abort the run.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', StandardScaler(), numCol),
        ('categorical', OneHotEncoder(handle_unknown='ignore'), oneCol),
    ])

In [7]:
# Candidate classifiers to compare, keyed by display name.
# Estimators with internal randomness get a fixed seed so the comparison (and
# therefore the choice of best model) is reproducible across kernel restarts.
RANDOM_STATE = 42

models = {
    # The seed matters for the liblinear/saga solvers tried during tuning.
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'SVM': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

In [8]:
def _mean_cv_accuracy(estimator):
    """Return the mean 5-fold cross-validated accuracy of `estimator`.

    The estimator is placed behind the shared `preprocessor` in a pipeline,
    and that pipeline is what gets cross-validated on (X, y).
    """
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('model', estimator)])
    fold_scores = cross_val_score(pipe, X, y, cv=5, scoring='accuracy')
    return np.mean(fold_scores)


# Score every candidate and remember its mean accuracy by display name.
results = {}
for name, estimator in models.items():
    mean_score = _mean_cv_accuracy(estimator)
    results[name] = mean_score
    print(f"{name} Cross-Validation Accuracy: {mean_score:.4f}")

Logistic Regression Cross-Validation Accuracy: 0.9173
Decision Tree Cross-Validation Accuracy: 0.8822
Random Forest Cross-Validation Accuracy: 0.9228
SVM Cross-Validation Accuracy: 0.9166
K-Nearest Neighbors Cross-Validation Accuracy: 0.8925


In [9]:
# Pick the candidate with the highest mean CV accuracy (first one wins a tie).
best_model_name = max(results.items(), key=lambda entry: entry[1])[0]
best_model = models[best_model_name]
print(f"\nBest Model: {best_model_name}")


Best Model: Random Forest


In [10]:
# Hyperparameter search spaces, one per candidate model. Keys use the
# `model__<param>` form so they address the 'model' step of the pipeline.

# Logistic regression: regularization strength, penalty type, solver.
logreg = dict(
    model__C=[0.01, 0.1, 1, 10, 100],
    model__penalty=['l1', 'l2'],
    model__solver=['liblinear', 'saga'],
)

# Decision tree: depth and split/leaf size limits, split criterion.
dt = dict(
    model__max_depth=[3, 5, 10, 20, None],
    model__min_samples_split=[2, 5, 10, 20],
    model__min_samples_leaf=[1, 2, 5, 10],
    model__criterion=['gini', 'entropy'],
)

# Random forest: ensemble size plus per-tree growth limits.
rf = dict(
    model__n_estimators=[50, 100, 200, 500],
    model__max_depth=[10, 20, 30, None],
    model__min_samples_split=[2, 5, 10],
    model__min_samples_leaf=[1, 2, 4],
    model__max_features=['sqrt', 'log2', None],
)

# Support vector classifier: margin penalty, kernel, kernel coefficient.
svc = dict(
    model__C=[0.1, 1, 10, 100],
    model__kernel=['linear', 'rbf', 'poly'],
    model__gamma=['scale', 'auto', 0.01, 0.1],
)

# k-nearest neighbours: neighbourhood size, vote weighting, Minkowski power.
knn = dict(
    model__n_neighbors=[3, 5, 7, 10, 15],
    model__weights=['uniform', 'distance'],
    model__p=[1, 2],
)

In [11]:
# Map each candidate's display name to its hyperparameter search space and
# pick the one belonging to the winning model.
search_spaces = {
    "Logistic Regression": logreg,
    "Decision Tree": dt,
    "Random Forest": rf,
    "SVM": svc,
    "K-Nearest Neighbors": knn,
}

# Fail loudly on an unknown name instead of leaving `param_dist` undefined
# (or, worse, silently reusing a stale value from an earlier run).
if best_model_name not in search_spaces:
    raise KeyError(
        f"No hyperparameter search space defined for model {best_model_name!r}"
    )
param_dist = search_spaces[best_model_name]

In [12]:
# Tune the winning model with a randomized search over its search space.
#
# The pipeline is built from models[best_model_name] rather than from
# `best_model`: this cell rebinds `best_model` to the fitted pipeline at the
# end, so reading it here would make a second run of the cell wrap a pipeline
# inside a pipeline and invalidate the `model__*` parameter names.
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', models[best_model_name]),
])

random_search = RandomizedSearchCV(
    final_pipeline,
    param_dist,
    n_iter=10,          # number of sampled parameter combinations
    cv=5,
    scoring='accuracy',
    verbose=1,
    random_state=42,    # fixes which combinations are sampled
)
random_search.fit(X, y)
print("Best Parameters:", random_search.best_params_)

# The search fits clones of `final_pipeline`; the tuned, fitted pipeline is
# exposed as best_estimator_.
best_model = random_search.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters: {'model__n_estimators': 500, 'model__min_samples_split': 10, 'model__min_samples_leaf': 2, 'model__max_features': 'log2', 'model__max_depth': 30}


In [13]:
# Load the unlabelled rows to be scored and preview the first five.
testdata = pd.read_csv(filepath_or_buffer=r'test_data.csv')
testdata.head(n=5)

Unnamed: 0,ID,Olympiad_Participation,Scholarship,School,Fav_sub,Projects,Grasp_pow,Time_sprt,Medals,Career_sprt,Act_sprt,Fant_arts,Won_arts,Time_art
0,0,No,No,Yes,Mathematics,No,4,2,No,No,No,Yes,No,2
1,1,Yes,Yes,Yes,Any language,Yes,2,3,Yes,No,No,No,No,2
2,2,Yes,Yes,Yes,Science,Yes,5,2,Yes,No,No,No,No,1
3,3,Yes,Yes,Yes,Science,Yes,3,3,No,No,No,Yes,Maybe,1
4,4,No,No,No,Mathematics,No,3,4,No,No,No,Yes,Yes,6


In [14]:
# Score the test rows with the tuned pipeline. 'ID' is an identifier, not a
# feature, so it is removed before predicting and re-attached to the output.
X_test = testdata.drop('ID', axis=1)
y_pred = best_model.predict(X_test)

# Map the integer class codes back to the original hobby names.
predicted_hobbies = label_encoder.inverse_transform(y_pred)
predictions_df = testdata[['ID']].assign(Predicted_Hobby=predicted_hobbies)

predictions_df.to_csv('predicted_hobbies.csv', index=False)
print("CSV file with predictions saved as 'predicted_hobbies.csv'")

CSV file with predictions saved as 'predicted_hobbies.csv'


In [15]:
# Persist the tuned, fitted pipeline.
# `final_pipeline` must NOT be saved here: it is only the template handed to
# RandomizedSearchCV, which fits clones of it, so it is still unfitted and
# carries the default hyperparameters. The fitted result of the search is
# random_search.best_estimator_.
# NOTE: the pipeline predicts integer class codes; `label_encoder` is needed
# to turn them back into hobby names.
joblib.dump(random_search.best_estimator_, 'hobby_predictor_pipeline.pkl')

['hobby_predictor_pipeline.pkl']