In [1]:
import pickle
import pandas as pd
import itertools
from collections import Counter
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.models import word2vec
from sklearn.linear_model import LogisticRegression
import os
import string

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
# # A function used to build a vocabulary based on descending word frequencies 
# def build_vocab(sentences):
#     # Build vocabulary
#     word_counts = Counter(itertools.chain(*sentences))
#     # Mapping from index to word
#     vocabulary_inv = [x[0] for x in word_counts.most_common()]
#     # Mapping from word to index
#     vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
#     return word_counts, vocabulary, vocabulary_inv

In [3]:
# # A function used to learn word embeddings through Word2vec module
# def get_embeddings(inp_data, vocabulary_inv, size_features=100,
#                    mode='skipgram',
#                    min_word_count=2,
#                    context=5):
#     model_name = "embedding"
#     model_name = os.path.join(model_name)
#     num_workers = 15  # Number of threads to run in parallel
#     downsampling = 1e-3  # Downsample setting for frequent words
#     print('Training Word2Vec model...')
#     # use inp_data and vocabulary_inv to reconstruct sentences
#     sentences = [[vocabulary_inv[w] for w in s] for s in inp_data]
#     if mode == 'skipgram':
#         sg = 1
#         print('Model: skip-gram')
#     elif mode == 'cbow':
#         sg = 0
#         print('Model: CBOW')
#     embedding_model = word2vec.Word2Vec(sentences, workers=num_workers,
#                                         sg=sg,
#                                         size=size_features,
#                                         min_count=min_word_count,
#                                         window=context,
#                                         sample=downsampling)
#     embedding_model.init_sims(replace=True)
#     print("Saving Word2Vec model {}".format(model_name))
#     embedding_weights = np.zeros((len(vocabulary_inv), size_features))
#     for i in range(len(vocabulary_inv)):
#         word = vocabulary_inv[i]
#         if word in embedding_model:
#             embedding_weights[i] = embedding_model[word]
#         else:
#             embedding_weights[i] = np.random.uniform(-0.25, 0.25,
#                                                      embedding_model.vector_size)
#     return embedding_weights

In [4]:
def preprocess_df(df):
    
    # get English stopwords
    stop_words = set(stopwords.words('english'))
    stop_words.add('would')
    # prepare translation table to translate punctuation to space
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    preprocessed_sentences = []
    for i, row in df.iterrows():
        sent = row["text"]
        sent_nopuncts = sent.translate(translator)
        words_list = sent_nopuncts.strip().split()
        # 将每个词转化为小写形式
        filtered_words = [word.lower() for word in words_list if word.lower() not in stop_words and len(word) != 1] # also skip space from above translation

#         filtered_words = [word for word in words_list if word not in stop_words and len(word) != 1] # also skip space from above translation
        preprocessed_sentences.append(" ".join(filtered_words))
    df["text"] = preprocessed_sentences
    return df

In [5]:
data_path = "../"

df_train = pd.read_csv(data_path + "train.csv")
df_test = pd.read_csv(data_path + "test.csv")

df_train["text"] = df_train["review"]
df_test["text"] = df_test["review"]
df_train = preprocess_df(df_train)
df_test = preprocess_df(df_test)


In [6]:

# # tokenization 
# tagged_data = [word_tokenize(_d) for i, _d in enumerate(df_train["text"])]
# # build vocabulary from tokenized data
# word_counts, vocabulary, vocabulary_inv = build_vocab(tagged_data)
# # use the above mapping to create input data
# inp_data = [[vocabulary[word] for word in text] for text in tagged_data]
# # get embedding vector
# embedding_weights = get_embeddings(inp_data, vocabulary_inv)

In [7]:
# 分词
tagged_train_data = [word_tokenize(_d) for i, _d in enumerate(df_train["text"])]
tagged_test_data = [word_tokenize(_d) for i, _d in enumerate(df_test["text"])]


In [8]:
# # 引入word2vec模型
# import gensim.downloader as api
# embedding_weights = api.load('word2vec-google-news-300')

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# 词语级tf-idf  矩阵代表了每个词语在不同文档中的TF-IDF
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(df_train['text'])
train_vec = tfidf_vect.transform(df_train['text'])
test_vec = tfidf_vect.transform(df_test['text'])

In [11]:
label = df_train["label"]

In [12]:
len(label)

13144

In [13]:
train_vec.shape

(13144, 5000)

In [14]:
# split train dataset and dev dataset
X_train, X_dev, y_train, y_dev = train_test_split(train_vec, label, test_size=0.2,shuffle=False)

In [15]:
from imblearn.over_sampling import SMOTE

# 定义SMOTE模型，random_state相当于随机数种子的作用
smo = SMOTE(random_state=42)
X_train, y_train = smo.fit_resample(X_train, y_train)

In [16]:
# logistic classfication model
model = LogisticRegression()
model.fit(X_train, y_train)
prediction = model.predict(X_dev)
print(classification_report(y_dev, prediction))

acc = metrics.accuracy_score(y_dev, prediction)
f1 = metrics.f1_score(y_dev, prediction,average='macro')
print(f'acc:{acc}, f1:{f1}')

                        precision    recall  f1-score   support

        american (new)       0.46      0.48      0.47       271
american (traditional)       0.69      0.73      0.71       545
          asian fusion       0.42      0.44      0.43        61
        canadian (new)       0.37      0.62      0.46        78
               chinese       0.94      0.93      0.93       338
               italian       0.93      0.83      0.88       412
              japanese       0.92      0.86      0.89       226
         mediterranean       0.90      0.78      0.84       137
               mexican       0.98      0.94      0.96       460
                  thai       0.96      0.91      0.93       101

              accuracy                           0.79      2629
             macro avg       0.76      0.75      0.75      2629
          weighted avg       0.81      0.79      0.80      2629

acc:0.7945987067325979, f1:0.7508002268796533


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
prediction = model.predict(X_dev)
print(classification_report(y_dev, prediction))

acc = metrics.accuracy_score(y_dev, prediction)
f1 = metrics.f1_score(y_dev, prediction,average='macro')
print(f'acc:{acc}, f1:{f1}')

                        precision    recall  f1-score   support

        american (new)       0.40      0.18      0.25       271
american (traditional)       0.61      0.83      0.70       545
          asian fusion       0.75      0.10      0.17        61
        canadian (new)       0.46      0.17      0.25        78
               chinese       0.87      0.95      0.91       338
               italian       0.85      0.89      0.87       412
              japanese       0.91      0.93      0.92       226
         mediterranean       0.87      0.80      0.83       137
               mexican       0.90      0.94      0.92       460
                  thai       0.92      0.89      0.90       101

              accuracy                           0.78      2629
             macro avg       0.76      0.67      0.67      2629
          weighted avg       0.76      0.78      0.75      2629

acc:0.7797641688855078, f1:0.673095839388868


In [18]:
from sklearn.svm import SVC
model = SVC()
model.fit(X_train, y_train)
prediction = model.predict(X_dev)
print(classification_report(y_dev, prediction))

acc = metrics.accuracy_score(y_dev, prediction)
f1 = metrics.f1_score(y_dev, prediction,average='macro')
print(f'acc:{acc}, f1:{f1}')

                        precision    recall  f1-score   support

        american (new)       0.51      0.34      0.41       271
american (traditional)       0.62      0.91      0.73       545
          asian fusion       0.52      0.26      0.35        61
        canadian (new)       0.45      0.26      0.33        78
               chinese       0.89      0.94      0.91       338
               italian       0.94      0.84      0.89       412
              japanese       0.92      0.88      0.90       226
         mediterranean       0.92      0.74      0.82       137
               mexican       0.98      0.92      0.95       460
                  thai       0.97      0.86      0.91       101

              accuracy                           0.80      2629
             macro avg       0.77      0.70      0.72      2629
          weighted avg       0.80      0.80      0.79      2629

acc:0.797641688855078, f1:0.720334977679156


In [19]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
prediction = model.predict(X_dev)
print(classification_report(y_dev, prediction))

acc = metrics.accuracy_score(y_dev, prediction)
f1 = metrics.f1_score(y_dev, prediction,average='macro')
print(f'acc:{acc}, f1:{f1}')

                        precision    recall  f1-score   support

        american (new)       0.22      0.23      0.22       271
american (traditional)       0.53      0.53      0.53       545
          asian fusion       0.11      0.18      0.14        61
        canadian (new)       0.27      0.38      0.32        78
               chinese       0.80      0.77      0.79       338
               italian       0.79      0.73      0.76       412
              japanese       0.83      0.75      0.79       226
         mediterranean       0.64      0.57      0.60       137
               mexican       0.86      0.85      0.85       460
                  thai       0.84      0.89      0.87       101

              accuracy                           0.64      2629
             macro avg       0.59      0.59      0.59      2629
          weighted avg       0.66      0.64      0.65      2629

acc:0.6397869912514264, f1:0.5864400283875524


In [20]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train, y_train)
prediction = model.predict(X_dev)
print(classification_report(y_dev, prediction))

acc = metrics.accuracy_score(y_dev, prediction)
f1 = metrics.f1_score(y_dev, prediction,average='macro')
print(f'acc:{acc}, f1:{f1}')

                        precision    recall  f1-score   support

        american (new)       0.30      0.59      0.40       271
american (traditional)       0.92      0.04      0.08       545
          asian fusion       0.16      0.49      0.24        61
        canadian (new)       0.12      0.62      0.20        78
               chinese       0.93      0.82      0.87       338
               italian       0.92      0.72      0.81       412
              japanese       0.85      0.83      0.84       226
         mediterranean       0.71      0.76      0.73       137
               mexican       0.98      0.82      0.89       460
                  thai       0.88      0.93      0.90       101

              accuracy                           0.61      2629
             macro avg       0.68      0.66      0.60      2629
          weighted avg       0.81      0.61      0.61      2629

acc:0.608596424496006, f1:0.5976158130873548


In [22]:
# 对整体的训练集进行smote数据增强
# 定义SMOTE模型，random_state相当于随机数种子的作用
from imblearn.over_sampling import SMOTE

oversample = SMOTE()
train_vec, label = oversample.fit_resample(train_vec, label)

In [25]:
# 通过上面模型的观察，发现在验证集上效果最好的是SVC模型，所以在这里使用所有数据训练SVC模型
# 注意，train_vec表示所有数据，这里我依然是用X_dev和y_dev进行验证，其实这些数据已经被包含在train_vec，label里了
# 但是没关系，这里只是最后看一眼效果，并不是真的再拿它当验证集
from sklearn.svm import SVC
model = SVC()
model.fit(train_vec, label)
prediction = model.predict(X_dev)
print(classification_report(y_dev, prediction))

acc = metrics.accuracy_score(y_dev, prediction)
f1 = metrics.f1_score(y_dev, prediction,average='macro')
print(f'acc:{acc}, f1:{f1}')


                        precision    recall  f1-score   support

        american (new)       0.98      0.95      0.96       271
american (traditional)       0.97      0.98      0.98       545
          asian fusion       1.00      1.00      1.00        61
        canadian (new)       0.95      1.00      0.97        78
               chinese       1.00      1.00      1.00       338
               italian       1.00      0.99      0.99       412
              japanese       0.99      1.00      1.00       226
         mediterranean       0.98      0.99      0.99       137
               mexican       1.00      0.99      0.99       460
                  thai       1.00      1.00      1.00       101

              accuracy                           0.99      2629
             macro avg       0.99      0.99      0.99      2629
          weighted avg       0.99      0.99      0.99      2629

acc:0.9874476987447699, f1:0.9882910874110443


In [26]:
preds = model.predict(test_vec)
# in your implemetation, create the output file using the same format
dic = {"Id": [], "Predicted": []}
for i, pred in enumerate(preds):
    dic["Id"].append(i)
    dic["Predicted"].append(pred)

dic_df = pd.DataFrame.from_dict(dic)
dic_df.to_csv(data_path + "predicted-tfidf-SVM-smote.csv", index=False)
