In [None]:
import Data_Editing_Helpers as DEH
import Classifier as CLS
import Regressor as RGS
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, r2_score
from sklearn.ensemble import StackingRegressor, StackingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [9]:
## Loading ##
test = pd.read_csv("Data/test.csv")
train = pd.read_csv("Data/train.csv")

In [10]:
y_name = 'cuisine' # What you're trying to predict
x_name = 'id' # User id. Drop this column

# Set this to True if you want to run regression models, False for classification models
is_regression = False

In [11]:
# Display basic info about datasets
train_info = train.info()
test_info = test.info()

# Display first few rows
train_head = train.head()
test_head = test.head()

# Check for missing values
missing_values_train = train.isnull().sum()
missing_values_test = test.isnull().sum()

# Summary statistics
train_description = train.describe()
test_description = test.describe()

train_info, test_info, train_head, test_head, missing_values_train, missing_values_test, train_description, test_description


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31818 entries, 0 to 31817
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           31818 non-null  int64 
 1   cuisine      31818 non-null  object
 2   ingredients  31818 non-null  object
dtypes: int64(1), object(2)
memory usage: 745.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7955 entries, 0 to 7954
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           7955 non-null   int64 
 1   cuisine      7955 non-null   object
 2   ingredients  7955 non-null   object
dtypes: int64(1), object(2)
memory usage: 186.6+ KB


(None,
 None,
        id   cuisine                                        ingredients
 0  786437     greek  romaine lettuce, black olives, grape tomatoes,...
 1  524295  filipino  eggs, pepper, salt, mayonaise, cooking oil, gr...
 2  524306    indian                  water, vegetable oil, wheat, salt
 3  524307    indian  black pepper, shallots, cornflour, cayenne pep...
 4  524308  jamaican  plain flour, sugar, butter, eggs, fresh ginger...,
        id cuisine                                        ingredients
 0  996716       X  black peppercorns, crushed red pepper, fresh g...
 1  829945       X  eggs, zucchini, pinto beans, chorizo sausage, ...
 2  829949       X  chicken broth, unsalted butter, garlic, ground...
 3  829953       X  pistachios, carrots, sugar, raisins, cashew nu...
 4  829954       X  jack cheese, cilantro sprigs, pumpkin seeds, c...,
 id             0
 cuisine        0
 ingredients    0
 dtype: int64,
 id             0
 cuisine        0
 ingredients    0
 dtype: i

In [12]:
## Wrangling ##
#Future implementation will remove map_seasons and convert_strings_to_ascii
train, test = DEH.map_seasons(train, test)

#train = DEH.convert_strings_to_ascii(train)
#test = DEH.convert_strings_to_ascii(test)
#train, test = DEH.dropUnusedColumns(train, test, y_name, x_name)
train = DEH.remove_blank_rows(train, y_name)
train, test = DEH.fill_NA(train, test, 0)

In [13]:
# Using 99 as a filler for NA's. Will change to use Random Forest for filling NA's 
train, test = DEH.fill_NA(train, test, fill=99)
X_train, X_test, y_train, y_test = DEH.traintestslpit(train, y_name)

In [None]:


vectorizer = CountVectorizer(tokenizer=lambda x: x.split(', '))
train_ingredients = vectorizer.fit_transform(train['ingredients'])
test_ingredients = vectorizer.transform(test['ingredients'])

train_ingredients_df = pd.DataFrame(train_ingredients.toarray(), columns=vectorizer.get_feature_names_out())
test_ingredients_df = pd.DataFrame(test_ingredients.toarray(), columns=vectorizer.get_feature_names_out())

# Add id and cuisine columns back to train, id to test
train = pd.concat([train[['id', 'cuisine']].reset_index(drop=True), train_ingredients_df], axis=1)
test = pd.concat([test[['id']].reset_index(drop=True), test_ingredients_df], axis=1)

# Now drop the original 'ingredients' column if it still exists
if 'ingredients' in train.columns:
    train = train.drop(columns=['ingredients'])
if 'ingredients' in test.columns:
    test = test.drop(columns=['ingredients'])

# Now split into X/y for modeling
X = train.drop(columns=['id', 'cuisine'])
y = train['cuisine']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def evaluate_model(model_func, X_train, y_train, X_test, y_test, model_name, results, is_regression):
    model = model_func(X_train, y_train)
    predictions = model.predict(X_test)
    if is_regression:
        score = r2_score(y_test, predictions) * 100
    else:
        score = accuracy_score(y_test, predictions) * 100
    results.append({"model": model_name, "score": score, "model_obj": model})
    print(f"{model_name} score: {score:.3f}")

## Training Models ##
results = []

if is_regression:
    evaluate_model(RGS.decisiontreeRegressor, X_train, y_train, X_test, y_test, "Decision Tree Regressor", results, is_regression)
    evaluate_model(RGS.linearRegressor, X_train, y_train, X_test, y_test, "Linear Regressor", results, is_regression)
    evaluate_model(RGS.ridgeRegressor, X_train, y_train, X_test, y_test, "Ridge Regressor", results, is_regression)
    evaluate_model(RGS.lassoRegressor, X_train, y_train, X_test, y_test, "Lasso Regressor", results, is_regression)
    evaluate_model(RGS.randomForestRegressor, X_train, y_train, X_test, y_test, "Random Forest Regressor", results, is_regression)
    evaluate_model(RGS.gradientBoostingRegressor, X_train, y_train, X_test, y_test, "Gradient Boosting Regressor", results, is_regression)
    evaluate_model(RGS.catBoostRegressor, X_train, y_train, X_test, y_test, "Cat Boost Regressor", results, is_regression)
    evaluate_model(RGS.knnRegressor, X_train, y_train, X_test, y_test, "KNN Regressor", results, is_regression)
    evaluate_model(RGS.xgBoostRegressor, X_train, y_train, X_test, y_test, "XGBoost Regressor", results, is_regression)

else:
    evaluate_model(CLS.decisiontreeClassifier, X_train, y_train, X_test, y_test, "Decision Tree Classifier", results, is_regression)
    evaluate_model(CLS.knnClassifier, X_train, y_train, X_test, y_test, "KNN Classifier", results, is_regression)
    evaluate_model(CLS.adaboostClassifier, X_train, y_train, X_test, y_test, "AdaBoost Classifier", results, is_regression)
    evaluate_model(CLS.randomForestClassifier, X_train, y_train, X_test, y_test, "Random Forest Classifier", results, is_regression)
    evaluate_model(CLS.gradientBoostingClassifier, X_train, y_train, X_test, y_test, "Gradient Boosting Classifier", results, is_regression)
    evaluate_model(CLS.catBoostClassifier, X_train, y_train, X_test, y_test, "CatBoost Classifier", results, is_regression)
    evaluate_model(CLS.xgBoostClassifier, X_train, y_train, X_test, y_test, "XGBoost Classifier", results, is_regression)




Decision Tree Classifier Started
Best parameters Decision Tree: {'classifier__min_weight_fraction_leaf': 0.0, 'classifier__min_samples_split': 15, 'classifier__min_samples_leaf': 3, 'classifier__max_leaf_nodes': 25, 'classifier__max_depth': 5}
Model saved to ./TrainedModels/decisiontreeClassifier.pkl
Decision Tree Classifier Finished
Decision Tree Classifier score: 30.955

KNN Classifier Started


In [None]:
# Evaluate the stacking model
for model in results:
    model['cv_score'] = cross_val_score(model['model_obj'], X_train, y_train, cv=5, scoring='r2').mean()

top_3_models = sorted(results, key=lambda x: x['cv_score'], reverse=True)[:3]

# Create the stacking model
if is_regression:
    estimators = [(model['model'], model['model_obj']) for model in top_3_models]
    stacking_model = StackingRegressor(estimators=estimators, final_estimator=Ridge(alpha=1.0))
else:
    estimators = [(model['model'], model['model_obj']) for model in top_3_models]
    stacking_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# Fit the stacking model
stacking_model.fit(X_train, y_train)

# Evaluate the stacking model
stacking_predictions = stacking_model.predict(X_test)
if is_regression:
    stacking_score = r2_score(y_test, stacking_predictions) * 100
else:
    stacking_score = accuracy_score(y_test, stacking_predictions) * 100

print(f"Stacking Model score: {stacking_score:.3f}")

In [None]:
# Make predictions using the stacking model
stacking_predictions_submission = stacking_model.predict(test)


In [None]:
# Print the top 3 models
print("Top 3 Models:")
for model in top_3_models:
    print(f"{model['model']}: {model['score']:.3f}")

In [None]:
# Save the predictions to a CSV file for submission
test = pd.read_csv('Data/test.csv')
submission = pd.DataFrame({
    x_name: test[x_name],
    y_name: stacking_predictions_submission
})
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")