# Data preparation

In [1]:
import pandas as pd
import numpy as np
# Read the labelled training recipes and the unlabelled recipes to be predicted
train = pd.read_json('train.json')
final_test = pd.read_json('test.json')
# Preview the first rows of both frames, one after the other
print(train.head(), final_test.head(), sep='\n')

      id      cuisine                                        ingredients
0  10259        greek  [romaine lettuce, black olives, grape tomatoes...
1  25693  southern_us  [plain flour, ground pepper, salt, tomatoes, g...
2  20130     filipino  [eggs, pepper, salt, mayonaise, cooking oil, g...
3  22213       indian                [water, vegetable oil, wheat, salt]
4  13162       indian  [black pepper, shallots, cornflour, cayenne pe...
      id                                        ingredients
0  18009  [baking powder, eggs, all-purpose flour, raisi...
1  28583  [sugar, egg yolks, corn starch, cream of tarta...
2  41580  [sausage links, fennel bulb, fronds, olive oil...
3  29752  [meat cuts, file powder, smoked sausage, okra,...
4  35687  [ground black pepper, salt, sausage casings, l...


In [2]:
# Training data preparation
from sklearn.model_selection import train_test_split
# Replace each ingredient list with its string form so it can be handled as text
train['ingredients'] = train['ingredients'].astype(str)
# Hold out 20% of the labelled rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    train['ingredients'],
    train['cuisine'],
    test_size=0.2,
    random_state=42,
)
# Show the feature column of both splits
print(X_train, X_test, sep='\n')

23436    ['shredded cheddar cheese', 'chicken meat', 'c...
7901     ['fresh cilantro', 'purple onion', 'ground cor...
25718    ['sugar', 'garlic', 'onions', 'vinegar', 'gree...
16909    ['raw pistachios', 'purple onion', 'couscous',...
34830    ['tomatoes', 'pepper', 'salsa', 'sliced green ...
                               ...                        
6265     ['tomato purée', 'butter', 'salt', 'taco seaso...
11284    ['marsala wine', 'butter', 'olive oil', 'fresh...
38158    ['blue crabs', 'peeled fresh ginger', 'soy sau...
860      ['pepper', 'spicy brown mustard', 'boneless ch...
15795    ['olive oil', 'thyme leaves', 'dried lavender'...
Name: ingredients, Length: 31819, dtype: object
21513    ['pork', 'cooking oil', 'bamboo shoots', 'chin...
1796     ['hog casings', 'hungarian paprika', 'ancho po...
21861    ['lamb stock', 'lemon', 'lamb shoulder', 'onio...
26571    ['green peas', 'cinnamon sticks', 'clove', 'ch...
28720    ['vegetable oil spray', 'cumin seed', 'grated ...
        

In [3]:
# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF with smoothed IDF and L2-normalised rows; min_df=0. / max_df=1. are the
# widest possible document-frequency bounds
vectorizer = TfidfVectorizer(
    min_df=0.,
    max_df=1.,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
)
# Fit on the training split only, then apply the fitted vectorizer to the hold-out split
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
# The recipes to be predicted get the same string conversion before being transformed
final_test_vec = vectorizer.transform(final_test['ingredients'].astype(str))

# Training

In [4]:
#Some models
from sklearn.linear_model import SGDClassifier as SGD
from sklearn.svm import LinearSVC as SVM
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import classification_report as CR
from sklearn.naive_bayes import BernoulliNB as NB
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.linear_model import Perceptron as PT
# Baseline configuration of every candidate classifier
svm = SVM(penalty='l2', C=1, max_iter=500, random_state=42)
svc = SVC(C=1, max_iter=500, random_state=42)
sgd = SGD(loss='hinge', penalty='l2', max_iter=500, random_state=42)
lr = LR(max_iter=500)
nb = NB(alpha=.1)
rfc = RFC(n_estimators=10, random_state=42)
pt = PT(tol=1e-3, random_state=42)
# Classifiers that will be trained and compared.
# NOTE: `svc` is configured above but is not part of this list.
models = [
    svm,
    sgd,
    lr,
    nb,
    rfc,
    pt,
]
# Collects one prediction array per trained model
pred = []

In [5]:
#Training with each model
def fit(x_train,y_train,x_test,y_test):
  """Train every classifier in the global ``models`` list and report on each.

  For each model, in list order: fit it on the training data, print the
  model, predict on the evaluation data, print the report produced by ``CR``
  and store the predictions in the global ``pred`` list.

  Args:
    x_train: Training features, passed to ``model.fit``.
    y_train: Training labels, passed to ``model.fit``.
    x_test: Evaluation features, passed to ``model.predict``.
    y_test: Evaluation labels, passed to ``CR`` together with the predictions.

  Returns:
    None. Results are printed and collected in ``pred``.
  """
  # Empty the shared result list first: without this, re-running the cell
  # stacks duplicate entries and pred[i] no longer corresponds to models[i].
  pred.clear()
  for model in models:
    model.fit(x_train,y_train)
    print(model)
    prediction=model.predict(x_test)
    print(CR(y_test,prediction))
    pred.append(prediction)
# Train and evaluate all candidate models on the vectorized splits
fit(
    x_train=X_train_vec,
    y_train=y_train,
    x_test=X_test_vec,
    y_test=y_test,
)

LinearSVC(C=1, max_iter=500, random_state=42)
              precision    recall  f1-score   support

   brazilian       0.77      0.61      0.68        84
     british       0.66      0.44      0.53       157
cajun_creole       0.77      0.67      0.72       328
     chinese       0.78      0.85      0.81       510
    filipino       0.71      0.59      0.64       136
      french       0.63      0.66      0.64       550
       greek       0.73      0.71      0.72       249
      indian       0.88      0.89      0.88       602
       irish       0.60      0.46      0.52       151
     italian       0.82      0.89      0.85      1567
    jamaican       0.82      0.69      0.75        91
    japanese       0.82      0.73      0.77       284
      korean       0.83      0.77      0.80       166
     mexican       0.90      0.93      0.92      1336
    moroccan       0.87      0.76      0.81       166
     russian       0.55      0.51      0.53        89
 southern_us       0.71      0.77  

In [6]:
# LinearSVC proved to be the best option, so rebuild it with more iterations
# (2000) and a tighter tolerance (1e-6), then evaluate it on the hold-out split
svm = SVM(penalty='l2', C=1, tol=1e-6, max_iter=2000, random_state=42)
# fit() returns the estimator itself, so training and prediction can be chained
y_pred = svm.fit(X_train_vec, y_train).predict(X_test_vec)
print(CR(y_test, y_pred))

              precision    recall  f1-score   support

   brazilian       0.77      0.61      0.68        84
     british       0.66      0.44      0.53       157
cajun_creole       0.77      0.67      0.72       328
     chinese       0.78      0.85      0.81       510
    filipino       0.71      0.59      0.64       136
      french       0.63      0.66      0.64       550
       greek       0.73      0.71      0.72       249
      indian       0.88      0.89      0.88       602
       irish       0.60      0.46      0.52       151
     italian       0.82      0.89      0.85      1567
    jamaican       0.82      0.69      0.75        91
    japanese       0.82      0.73      0.77       284
      korean       0.83      0.77      0.80       166
     mexican       0.90      0.93      0.92      1336
    moroccan       0.87      0.76      0.81       166
     russian       0.55      0.51      0.53        89
 southern_us       0.71      0.77      0.74       848
     spanish       0.64    

# Execution

In [7]:
# Predict the cuisine of the actual test data with the LinearSVC held in `svm`
y_pred = svm.predict(final_test_vec)
# Build the frame used for the CSV: drop the ingredient lists and attach the
# predictions as a `cuisine` column
result_df = final_test.drop(columns=['ingredients']).assign(cuisine=y_pred)
print(result_df)

         id       cuisine
0     18009   southern_us
1     28583   southern_us
2     41580       italian
3     29752  cajun_creole
4     35687       italian
...     ...           ...
9939  30246        french
9940  36028      filipino
9941  22339       italian
9942  42525   southern_us
9943   1443       mexican

[9944 rows x 2 columns]


# CSV creation

In [8]:
# CSV creation and download
file_name_csv = "submission_Jairo_Acevedo_17_svm.csv"  # Name of the output CSV file
# Write the file before touching anything Colab-specific, so the submission is
# saved regardless of where the notebook runs
result_df.to_csv(file_name_csv, index=False)  # index=False omits the index column
try:
    from google.colab import files
except ImportError:
    # Not running in Google Colab: there is no browser download helper, and the
    # CSV has already been written to the working directory
    print(f"Saved {file_name_csv}")
else:
    files.download(file_name_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>