# (3) Classification by Machine Learning
3 algorithms, Naive-Bayes, Logistic Regression and Random Forest Classifier are used and compared.

In [None]:
# comment out this cell if we want to see warnings
import warnings 
warnings.filterwarnings('ignore')

In [81]:
# imports
import pandas as pd
import numpy as np
import re
import requests

from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import plot_confusion_matrix, classification_report

# set options for pandas DataFrame display
pd.set_option('display.max_colwidth', None)

In [2]:
# load datasets
japanese = pd.read_csv('datasets/japanese.csv')
korean = pd.read_csv('datasets/korean.csv')

In [3]:
# check number of rows
japanese.shape, korean.shape

((995, 1), (976, 1))

In [4]:
# remove non-English characters
japanese['post'] = japanese['post'].map(lambda x: re.sub(r"[^\x00-\x7F]", '', x))
korean['post'] = korean['post'].map(lambda x: re.sub(r"[^\x00-\x7F]", '', x))

In [5]:
# create X and y for modelling

# combine japanese and korean 'post' feature into single pandas Series
X = pd.concat([japanese['post'], korean['post']])

# create labels 1 for Japanese and 0 for Korean
y = pd.Series([1] * japanese.shape[0] + [0] * korean.shape[0], name='japanese')

In [6]:
# proportion of japanese posts vs korean posts
y.value_counts(normalize = True)

1    0.50482
0    0.49518
Name: japanese, dtype: float64

### Baseline Accuracy
The baseline accuracy is 0.505 if all posts are predicted to be from the Japanese subreddit.

In [7]:
# load stopwords lists
stopwords = pd.read_csv('datasets/stopwords.csv', index_col='Unnamed: 0')
# check indices
stopwords.index

Index(['eng_jk', 'common_eng', 'common_eng_jk', 'eng_lem', 'eng_jk_lem',
       'common_eng_lem', 'common_eng_jk_lem'],
      dtype='object')

In [8]:
# create function to retrieve stopwords list
def get_stopwords(index):
    sw_list = stopwords.loc[index]['list'].replace("'","")[1:-1].split(", ")
    return [re.sub(r'"', '', x) for x in sw_list]

In [9]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=42)
# check shape
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1478,), (493,), (1478,), (493,))

## Functions to Set and Evaluate Models

In [10]:
def set_pipe(algorithm='nb', vectorizer='cvec'):
    
    algo_dic = {
        'nb': MultinomialNB(),
        'logreg': LogisticRegression(solver='liblinear', random_state=42),
        'forest': RandomForestClassifier(random_state=42)
    }

    vect_dic = {
        'cvec': CountVectorizer(token_pattern=r"\w+"),
        'tvec': TfidfVectorizer(token_pattern=r"\w+")
    }
        
    pipe = Pipeline([
        (vectorizer, vect_dic[vectorizer]),
        (algorithm, algo_dic[algorithm])
    ])
    
    return pipe

In [11]:
def set_model(
    algorithm='nb',
    vectorizer='cvec',
    max_df=[1.0], 
    max_features=[None], 
    min_df=[1], 
    ngram_range=[(1,1)],
    preprocessor=[None], 
    stopwords=[None], 
    algo_params={}
):
    
    params = {
        vectorizer+'__max_df': max_df,
        vectorizer+'__max_features': max_features,
        vectorizer+'__min_df': min_df,
        vectorizer+'__ngram_range': ngram_range,
        vectorizer+'__preprocessor': preprocessor,
        vectorizer+'__stop_words': stopwords,
    }
    params.update(algo_params)
    
    return GridSearchCV(set_pipe(algorithm, vectorizer), params, cv=5)

In [12]:
def eval_model(model):
    # Fit model to training data.
    model.fit(X_train, y_train)
    
    print(f"Best Score: {model.best_score_}")
    print(f"Train Score: {model.score(X_train, y_train)}")
    print(f"Test Score: {model.score(X_test, y_test)}")
    print(classification_report(y_test, model.predict(X_test)))

In [13]:
# for Multinomial Naive-Bayes and Logistic Regression
def get_coef(model):
    coef = model.best_estimator_.steps[1][1].coef_
    features = model.best_estimator_.steps[0][1].get_feature_names_out()
    df = pd.DataFrame(coef, columns=features, index=['coef'] ).T.sort_values('coef')
    display(df.head(10))
    display(df.tail(10))

In [63]:
# for Random Forest Classifier
def get_feature_impt(model):
    importances = model.best_estimator_.steps[1][1].feature_importances_
    features = model.best_estimator_.steps[0][1].get_feature_names_out()
    df = pd.DataFrame(zip(features,importances)).sort_values(1, ascending=False)
    display(df.head(20))

## Untuned Models
The first set of untuned models are run without removing the keywords 'japanese' and 'korean', while the second set are run after removing these 2 obvious keywords.
### Keywords 'japanese' and 'korean' NOT Removed

In [16]:
nb_cvec = set_model('nb', 'cvec')
eval_model(nb_cvec)

Best Score: 0.84439761795694
Train Score: 0.9296346414073072
Test Score: 0.8823529411764706
              precision    recall  f1-score   support

           0       0.87      0.90      0.88       244
           1       0.90      0.86      0.88       249

    accuracy                           0.88       493
   macro avg       0.88      0.88      0.88       493
weighted avg       0.88      0.88      0.88       493



In [17]:
nb_tvec = set_model('nb', 'tvec')
eval_model(nb_tvec)

Best Score: 0.8119285387081998
Train Score: 0.9451962110960758
Test Score: 0.8377281947261663
              precision    recall  f1-score   support

           0       0.86      0.80      0.83       244
           1       0.82      0.88      0.84       249

    accuracy                           0.84       493
   macro avg       0.84      0.84      0.84       493
weighted avg       0.84      0.84      0.84       493



In [18]:
logreg_cvec = set_model('logreg', 'cvec')
eval_model(logreg_cvec)

Best Score: 0.8572721026110856
Train Score: 0.9871447902571042
Test Score: 0.8600405679513184
              precision    recall  f1-score   support

           0       0.84      0.89      0.86       244
           1       0.89      0.83      0.86       249

    accuracy                           0.86       493
   macro avg       0.86      0.86      0.86       493
weighted avg       0.86      0.86      0.86       493



In [19]:
logreg_tvec = set_model('logreg', 'tvec')
eval_model(logreg_tvec)

Best Score: 0.8701053595968851
Train Score: 0.9654939106901218
Test Score: 0.8823529411764706
              precision    recall  f1-score   support

           0       0.84      0.93      0.89       244
           1       0.93      0.83      0.88       249

    accuracy                           0.88       493
   macro avg       0.89      0.88      0.88       493
weighted avg       0.89      0.88      0.88       493



In [20]:
forest_cvec = set_model('forest', 'cvec')
eval_model(forest_cvec)

Best Score: 0.8599610627576728
Train Score: 0.9952638700947226
Test Score: 0.8661257606490872
              precision    recall  f1-score   support

           0       0.81      0.95      0.88       244
           1       0.94      0.79      0.86       249

    accuracy                           0.87       493
   macro avg       0.88      0.87      0.87       493
weighted avg       0.88      0.87      0.87       493



In [21]:
forest_tvec = set_model('forest', 'tvec')
eval_model(forest_tvec)

Best Score: 0.8572514887769124
Train Score: 0.9952638700947226
Test Score: 0.8356997971602435
              precision    recall  f1-score   support

           0       0.80      0.90      0.84       244
           1       0.89      0.78      0.83       249

    accuracy                           0.84       493
   macro avg       0.84      0.84      0.84       493
weighted avg       0.84      0.84      0.84       493



### Results without removing keywords 'japanese' and 'korean'
|Algorithm|Vectorizer|Best Score|Train Score|Test Score|f1 Score|Sensitivity|Specificity|
|---|---|---|---|---|---|---|---|
|Multinomial Naive-Bayes|CountVectorizer|0.844|0.930|0.882|0.88|0.86|0.90|
|Multinomial Naive-Bayes|TfidfVectorizer|0.812|0.945|0.838|0.84|0.88|0.80|
|Logistic Regression|CountVectorizer|0.857|0.987|0.860|0.86|0.83|0.89|
|Logistic Regression|TfidfVectorizer|0.870|0.965|0.882|0.88|0.83|0.93|
|Random Forest Classifier|CountVectorizer|0.860|0.995|0.866|0.87|0.79|0.95|
|Random Forest Classifier|TfidfVectorizer|0.857|0.995|0.836|0.84|0.78|0.90|

The scores are all quite good with some overfit as can be seen from the higher Train Scores (above 0.9) compared to Test Scores (between 0.8 and 0.9). Random Forest Classifier greatly overfits the training data with Train Scores very close to 1.0.

All models fare similarly in terms of the confusion matrix, with f1 Scores between 0.84 to 0.88. The rates of misclassifying Japanese as Korean vs misclassifying Korean as Japanese are about the same, though the Logistic Regression and Random Forest Classifier models have a lower sensitivity than specificity (misclassifies Japanese posts as Korean more frequently).

### Keywords 'japanese' and 'korean' Removed

In [22]:
nb_cvec_jk = set_model('nb', 'cvec', stopwords=[['japanese', 'korean']])
eval_model(nb_cvec_jk)

Best Score: 0.7361245991754466
Train Score: 0.8944519621109608
Test Score: 0.7829614604462475
              precision    recall  f1-score   support

           0       0.79      0.77      0.78       244
           1       0.78      0.80      0.79       249

    accuracy                           0.78       493
   macro avg       0.78      0.78      0.78       493
weighted avg       0.78      0.78      0.78       493



In [23]:
nb_tvec_jk = set_model('nb', 'tvec', stopwords=[['japanese', 'korean']])
eval_model(nb_tvec_jk)

Best Score: 0.7165139715987173
Train Score: 0.9133964817320703
Test Score: 0.7505070993914807
              precision    recall  f1-score   support

           0       0.80      0.66      0.72       244
           1       0.71      0.84      0.77       249

    accuracy                           0.75       493
   macro avg       0.76      0.75      0.75       493
weighted avg       0.76      0.75      0.75       493



In [24]:
logreg_cvec_jk = set_model('logreg', 'cvec', stopwords=[['japanese', 'korean']])
eval_model(logreg_cvec_jk)

Best Score: 0.7368048557031608
Train Score: 0.979702300405954
Test Score: 0.7363083164300203
              precision    recall  f1-score   support

           0       0.71      0.78      0.75       244
           1       0.76      0.69      0.73       249

    accuracy                           0.74       493
   macro avg       0.74      0.74      0.74       493
weighted avg       0.74      0.74      0.74       493



In [25]:
logreg_tvec_jk = set_model('logreg', 'tvec', stopwords=[['japanese', 'korean']])
eval_model(logreg_tvec_jk)

Best Score: 0.7462620247366011
Train Score: 0.922192151556157
Test Score: 0.768762677484787
              precision    recall  f1-score   support

           0       0.74      0.82      0.78       244
           1       0.81      0.71      0.76       249

    accuracy                           0.77       493
   macro avg       0.77      0.77      0.77       493
weighted avg       0.77      0.77      0.77       493



In [26]:
forest_cvec_jk = set_model('forest', 'cvec', stopwords=[['japanese', 'korean']])
eval_model(forest_cvec_jk)

Best Score: 0.7185364177737059
Train Score: 0.9952638700947226
Test Score: 0.7342799188640974
              precision    recall  f1-score   support

           0       0.70      0.81      0.75       244
           1       0.78      0.66      0.71       249

    accuracy                           0.73       493
   macro avg       0.74      0.74      0.73       493
weighted avg       0.74      0.73      0.73       493



In [27]:
forest_tvec_jk = set_model('forest', 'tvec', stopwords=[['japanese', 'korean']])
eval_model(forest_tvec_jk)

Best Score: 0.717888227210261
Train Score: 0.9952638700947226
Test Score: 0.7200811359026369
              precision    recall  f1-score   support

           0       0.70      0.76      0.73       244
           1       0.74      0.68      0.71       249

    accuracy                           0.72       493
   macro avg       0.72      0.72      0.72       493
weighted avg       0.72      0.72      0.72       493



### Results after removing keywords 'japanese' and 'korean'
|Algorithm|Vectorizer|Best Score|Train Score|Test Score|f1 Score|Sensitivity|Specificity|
|---|---|---|---|---|---|---|---|
|Multinomial Naive-Bayes|CountVectorizer|0.736|0.894|0.783|0.78|0.80|0.77|
|Multinomial Naive-Bayes|TfidfVectorizer|0.717|0.913|0.751|0.75|0.84|0.66|
|Logistic Regression|CountVectorizer|0.737|0.980|0.736|0.74|0.69|0.78|
|Logistic Regression|TfidfVectorizer|0.746|0.922|0.769|0.77|0.71|0.82|
|Random Forest Classifier|CountVectorizer|0.719|0.995|0.734|0.73|0.66|0.81|
|Random Forest Classifier|TfidfVectorizer|0.718|0.995|0.720|0.72|0.68|0.76|

As expected, the scores have all dropped while overfit is even more pronounced with Test Scores dropping to below 0.8. Random Forest Classifier still greatly overfits the training data with Train Scores very close to 1.0.

The f1 Scores have all dropped to below 0.8.

## Modelling with Lemmatization

In [28]:
nb_cvec_jk_lem = set_model('nb', 'cvec', 
                           stopwords=[['japanese', 'korean']], 
                           preprocessor=[WordNetLemmatizer().lemmatize])
eval_model(nb_cvec_jk_lem)

Best Score: 0.7361245991754466
Train Score: 0.8944519621109608
Test Score: 0.7829614604462475
              precision    recall  f1-score   support

           0       0.79      0.77      0.78       244
           1       0.78      0.80      0.79       249

    accuracy                           0.78       493
   macro avg       0.78      0.78      0.78       493
weighted avg       0.78      0.78      0.78       493



In [29]:
nb_tvec_jk_lem = set_model('nb', 'tvec', 
                           stopwords=[['japanese', 'korean']], 
                           preprocessor=[WordNetLemmatizer().lemmatize])
eval_model(nb_tvec_jk_lem)

Best Score: 0.7165139715987173
Train Score: 0.9133964817320703
Test Score: 0.7505070993914807
              precision    recall  f1-score   support

           0       0.80      0.66      0.72       244
           1       0.71      0.84      0.77       249

    accuracy                           0.75       493
   macro avg       0.76      0.75      0.75       493
weighted avg       0.76      0.75      0.75       493



In [30]:
logreg_cvec_jk_lem = set_model('logreg', 'cvec', 
                               stopwords=[['japanese', 'korean']], 
                               preprocessor=[WordNetLemmatizer().lemmatize])
eval_model(logreg_cvec_jk_lem)

Best Score: 0.7368048557031608
Train Score: 0.979702300405954
Test Score: 0.7363083164300203
              precision    recall  f1-score   support

           0       0.71      0.78      0.75       244
           1       0.76      0.69      0.73       249

    accuracy                           0.74       493
   macro avg       0.74      0.74      0.74       493
weighted avg       0.74      0.74      0.74       493



In [31]:
logreg_tvec_jk_lem = set_model('logreg', 'tvec', 
                               stopwords=[['japanese', 'korean']], 
                               preprocessor=[WordNetLemmatizer().lemmatize])
eval_model(logreg_tvec_jk_lem)

Best Score: 0.7462620247366011
Train Score: 0.922192151556157
Test Score: 0.768762677484787
              precision    recall  f1-score   support

           0       0.74      0.82      0.78       244
           1       0.81      0.71      0.76       249

    accuracy                           0.77       493
   macro avg       0.77      0.77      0.77       493
weighted avg       0.77      0.77      0.77       493



In [32]:
forest_cvec_jk_lem = set_model('forest', 'cvec', 
                               stopwords=[['japanese', 'korean']], 
                               preprocessor=[WordNetLemmatizer().lemmatize])
eval_model(forest_cvec_jk_lem)

Best Score: 0.7185364177737059
Train Score: 0.9952638700947226
Test Score: 0.7342799188640974
              precision    recall  f1-score   support

           0       0.70      0.81      0.75       244
           1       0.78      0.66      0.71       249

    accuracy                           0.73       493
   macro avg       0.74      0.74      0.73       493
weighted avg       0.74      0.73      0.73       493



In [34]:
forest_tvec_jk_lem = set_model('forest', 'tvec', 
                               stopwords=[['japanese', 'korean']], 
                               preprocessor=[WordNetLemmatizer().lemmatize])
eval_model(forest_tvec_jk_lem)

Best Score: 0.717888227210261
Train Score: 0.9952638700947226
Test Score: 0.7200811359026369
              precision    recall  f1-score   support

           0       0.70      0.76      0.73       244
           1       0.74      0.68      0.71       249

    accuracy                           0.72       493
   macro avg       0.72      0.72      0.72       493
weighted avg       0.72      0.72      0.72       493



### Results with Lemmatization
The results are the same for all models. This is consistent with the analysis from part (2) Data Analysis, where it was found that there was very little difference between the lemmatized and non-lemmatized sets of common words in the 2 subreddits. The models are tuned without Lemmatization using GridSearch.

## Tuning Models without Lemmatization

In [35]:
# first attempt at tuning nb cvec model
nb_cvec_tuned = set_model('nb', 'cvec', 
                          max_df=[.2,.8],
                          max_features=[None, 3000, 5000],
                          min_df=[1,2],
                          ngram_range=[(1,1),(1,2)],
                          stopwords=[['japanese', 'korean'], 
                                     get_stopwords('eng_jk'), 
                                     get_stopwords('common_eng_jk')], 
                          algo_params={
                              'nb__alpha': [1.0e-10,.1,1.0]
                          })
eval_model(nb_cvec_tuned)

Best Score: 0.7598190563444801
Train Score: 0.8930987821380244
Test Score: 0.7890466531440162
              precision    recall  f1-score   support

           0       0.80      0.77      0.78       244
           1       0.78      0.81      0.80       249

    accuracy                           0.79       493
   macro avg       0.79      0.79      0.79       493
weighted avg       0.79      0.79      0.79       493



In [36]:
nb_cvec_tuned.best_params_

{'cvec__max_df': 0.2,
 'cvec__max_features': 5000,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 2),
 'cvec__preprocessor': None,
 'cvec__stop_words': ['word',
  'understand',
  'good',
  'also',
  'lot',
  'used',
  'learning',
  'much',
  'something',
  'english',
  'anyone',
  'first',
  'ive',
  'learn',
  'know',
  'way',
  'words',
  'sentence',
  'like',
  'people',
  'help',
  'dont',
  'get',
  'grammar',
  'really',
  'read',
  'find',
  'time',
  'want',
  'use',
  'language',
  'would',
  'think',
  'im',
  'study',
  'one',
  'studying',
  'i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  'youre',
  'youve',
  'youll',
  'youd',
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  'shes',
  'her',
  'hers',
  'herself',
  'it',
  'its',
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  'thatll',
  't

### Results
|Tuning|Best Score|Train Score|Test Score|f1 Score|Sensitivity|Specificity|
|---|---|---|---|---|---|---|
|Before|0.736|0.894|0.783|0.78|0.80|0.77|
|1st Tuning|0.760|0.893|0.789|0.79|0.81|0.77|

There is a slight improvement in scores and the overfit is reduced.

The best parameters of min_df=1, stopwords=common_eng_jk are retained and the rest undergo a second set of tuning.

In [38]:
# second attempt at tuning nb cvec model
nb_cvec_tuned_2 = set_model('nb', 'cvec', 
                            max_df=[.1,.2,.4],
                            max_features=[4000,5000,6000],
                            ngram_range=[(1,2),(1,3),(2,3)],
                            stopwords=[get_stopwords('common_eng_jk')], 
                            algo_params={
                                'nb__alpha': [.8,1.0]
                            })
eval_model(nb_cvec_tuned_2)

Best Score: 0.7625217590471827
Train Score: 0.8944519621109608
Test Score: 0.7931034482758621
              precision    recall  f1-score   support

           0       0.80      0.77      0.79       244
           1       0.78      0.82      0.80       249

    accuracy                           0.79       493
   macro avg       0.79      0.79      0.79       493
weighted avg       0.79      0.79      0.79       493



In [39]:
nb_cvec_tuned_2.best_params_

{'cvec__max_df': 0.2,
 'cvec__max_features': 5000,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 3),
 'cvec__preprocessor': None,
 'cvec__stop_words': ['word',
  'understand',
  'good',
  'also',
  'lot',
  'used',
  'learning',
  'much',
  'something',
  'english',
  'anyone',
  'first',
  'ive',
  'learn',
  'know',
  'way',
  'words',
  'sentence',
  'like',
  'people',
  'help',
  'dont',
  'get',
  'grammar',
  'really',
  'read',
  'find',
  'time',
  'want',
  'use',
  'language',
  'would',
  'think',
  'im',
  'study',
  'one',
  'studying',
  'i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  'youre',
  'youve',
  'youll',
  'youd',
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  'shes',
  'her',
  'hers',
  'herself',
  'it',
  'its',
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  'thatll',
  't

### Results
|Tuning|Best Score|Train Score|Test Score|f1 Score|Sensitivity|Specificity|
|---|---|---|---|---|---|---|
|Before|0.736|0.894|0.783|0.78|0.80|0.77|
|1st Tuning|0.760|0.893|0.789|0.79|0.81|0.77|
|2nd Tuning|0.763|0.894|0.793|0.79|0.82|0.77|

There is only a slight decrease in scores and reduction in overfit.

In [40]:
# first attempt at tuning nb tvec model
nb_tvec_tuned = set_model('nb', 'tvec', 
                          max_df=[.2,.8],
                          max_features=[None, 3000, 5000],
                          min_df=[1,2],
                          ngram_range=[(1,1),(1,2)],
                          stopwords=[['japanese', 'korean'], 
                                     get_stopwords('eng_jk'), 
                                     get_stopwords('common_eng_jk')], 
                          algo_params={
                              'nb__alpha': [1.0e-10,.1,1.0]
                          })
eval_model(nb_tvec_tuned)

Best Score: 0.7604947320201557
Train Score: 0.959404600811908
Test Score: 0.7789046653144016
              precision    recall  f1-score   support

           0       0.80      0.74      0.77       244
           1       0.76      0.82      0.79       249

    accuracy                           0.78       493
   macro avg       0.78      0.78      0.78       493
weighted avg       0.78      0.78      0.78       493



In [41]:
nb_tvec_tuned.best_params_

{'nb__alpha': 0.1,
 'tvec__max_df': 0.2,
 'tvec__max_features': None,
 'tvec__min_df': 2,
 'tvec__ngram_range': (1, 2),
 'tvec__preprocessor': None,
 'tvec__stop_words': ['word',
  'understand',
  'good',
  'also',
  'lot',
  'used',
  'learning',
  'much',
  'something',
  'english',
  'anyone',
  'first',
  'ive',
  'learn',
  'know',
  'way',
  'words',
  'sentence',
  'like',
  'people',
  'help',
  'dont',
  'get',
  'grammar',
  'really',
  'read',
  'find',
  'time',
  'want',
  'use',
  'language',
  'would',
  'think',
  'im',
  'study',
  'one',
  'studying',
  'i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  'youre',
  'youve',
  'youll',
  'youd',
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  'shes',
  'her',
  'hers',
  'herself',
  'it',
  'its',
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that

|Tuning|Best Score|Train Score|Test Score|f1 Score|Sensitivity|Specificity|
|---|---|---|---|---|---|---|
|Before|0.717|0.913|0.751|0.75|0.84|0.66|
|1st Tuning|0.760|0.959|0.779|0.78|0.82|0.74|

There are some improvement in scores, but the overfit also became more pronounced. The f1 score improved as the sensitivity dropped but the specificity improved more. 

The best parameters max_features=None, stopwords=common_eng_jk are retained and the rest undergo a second set of tuning.

In [50]:
# second attempt at tuning nb tvec model
nb_tvec_tuned_2 = set_model('nb', 'tvec', 
                            max_df=[.1,.2,.4],
                            min_df=[2,3,4],
                            ngram_range=[(1,2), (1,3), (2,3)],
                            stopwords=[get_stopwords('common_eng_jk')], 
                            algo_params={
                                'nb__alpha': [.1,.2,.4]
                            })
eval_model(nb_tvec_tuned_2)

Best Score: 0.7611704076958314
Train Score: 0.945872801082544
Test Score: 0.7849898580121704
              precision    recall  f1-score   support

           0       0.80      0.75      0.78       244
           1       0.77      0.82      0.79       249

    accuracy                           0.78       493
   macro avg       0.79      0.78      0.78       493
weighted avg       0.79      0.78      0.78       493



In [51]:
nb_tvec_tuned_2.best_params_

{'nb__alpha': 0.4,
 'tvec__max_df': 0.2,
 'tvec__max_features': None,
 'tvec__min_df': 2,
 'tvec__ngram_range': (1, 2),
 'tvec__preprocessor': None,
 'tvec__stop_words': ['word',
  'understand',
  'good',
  'also',
  'lot',
  'used',
  'learning',
  'much',
  'something',
  'english',
  'anyone',
  'first',
  'ive',
  'learn',
  'know',
  'way',
  'words',
  'sentence',
  'like',
  'people',
  'help',
  'dont',
  'get',
  'grammar',
  'really',
  'read',
  'find',
  'time',
  'want',
  'use',
  'language',
  'would',
  'think',
  'im',
  'study',
  'one',
  'studying',
  'i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  'youre',
  'youve',
  'youll',
  'youd',
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  'shes',
  'her',
  'hers',
  'herself',
  'it',
  'its',
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that

|Tuning|Best Score|Train Score|Test Score|f1 Score|Sensitivity|Specificity|
|---|---|---|---|---|---|---|
|Before|0.717|0.913|0.751|0.75|0.84|0.66|
|1st Tuning|0.760|0.959|0.779|0.78|0.82|0.74|
|2nd Tuning|0.761|0.946|0.785|0.78|0.82|0.75|

There is a slight improvement in scores and the overfit is reduced.

In [42]:
# first attempt at tuning logreg cvec model
logreg_cvec_tuned = set_model('logreg', 'cvec', 
                              max_df=[.2,.8],
                              max_features=[None, 3000, 5000],
                              min_df=[1,2],
                              ngram_range=[(1,1),(1,2)],
                              stopwords=[['japanese', 'korean'], 
                                         get_stopwords('eng_jk'), 
                                         get_stopwords('common_eng_jk')], 
                              algo_params={
                                  'logreg__C': [.2,.8],
                                  'logreg__max_iter': [100, 200],
                                  'logreg__penalty': ['l1', 'l2']
                              })
eval_model(logreg_cvec_tuned)

Best Score: 0.7604901511681172
Train Score: 0.969553450608931
Test Score: 0.7626774847870182
              precision    recall  f1-score   support

           0       0.73      0.83      0.78       244
           1       0.81      0.70      0.75       249

    accuracy                           0.76       493
   macro avg       0.77      0.76      0.76       493
weighted avg       0.77      0.76      0.76       493



In [43]:
# check best parameters
logreg_cvec_tuned.best_params_

{'cvec__max_df': 0.2,
 'cvec__max_features': None,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2),
 'cvec__preprocessor': None,
 'cvec__stop_words': ['word',
  'understand',
  'good',
  'also',
  'lot',
  'used',
  'learning',
  'much',
  'something',
  'english',
  'anyone',
  'first',
  'ive',
  'learn',
  'know',
  'way',
  'words',
  'sentence',
  'like',
  'people',
  'help',
  'dont',
  'get',
  'grammar',
  'really',
  'read',
  'find',
  'time',
  'want',
  'use',
  'language',
  'would',
  'think',
  'im',
  'study',
  'one',
  'studying',
  'i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  'youre',
  'youve',
  'youll',
  'youd',
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  'shes',
  'her',
  'hers',
  'herself',
  'it',
  'its',
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  'thatll',
  't

### Results
|Tuning|Best Score|Train Score|Test Score|f1 Score|Sensitivity|Specificity|
|---|---|---|---|---|---|---|
|Before|0.737|0.980|0.736|0.74|0.69|0.78|
|1st Tuning|0.760|0.970|0.763|0.76|0.70|0.83|

There is an improvement in scores and the overfit is reduced.

The best parameters of max_features=None, stopwords=common_eng_jk, max_iter=100, penalty='l2' are retained and the rest undergo a second set of tuning.

In [52]:
# second attempt at tuning logreg cvec model
logreg_cvec_tuned_2 = set_model('logreg', 'cvec', 
                                max_df=[.1,.2,.4],
                                min_df=[2,3,4],
                                stopwords=[get_stopwords('common_eng_jk')], 
                                algo_params={
                                    'logreg__C': [.6,.8,1.0]
                                })
eval_model(logreg_cvec_tuned_2)

Best Score: 0.7537379752633991
Train Score: 0.959404600811908
Test Score: 0.7545638945233266
              precision    recall  f1-score   support

           0       0.72      0.82      0.77       244
           1       0.79      0.69      0.74       249

    accuracy                           0.75       493
   macro avg       0.76      0.76      0.75       493
weighted avg       0.76      0.75      0.75       493



In [53]:
# check best parameters
logreg_cvec_tuned_2.best_params_

{'cvec__max_df': 0.2,
 'cvec__max_features': None,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 1),
 'cvec__preprocessor': None,
 'cvec__stop_words': ['word',
  'understand',
  'good',
  'also',
  'lot',
  'used',
  'learning',
  'much',
  'something',
  'english',
  'anyone',
  'first',
  'ive',
  'learn',
  'know',
  'way',
  'words',
  'sentence',
  'like',
  'people',
  'help',
  'dont',
  'get',
  'grammar',
  'really',
  'read',
  'find',
  'time',
  'want',
  'use',
  'language',
  'would',
  'think',
  'im',
  'study',
  'one',
  'studying',
  'i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  'youre',
  'youve',
  'youll',
  'youd',
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  'shes',
  'her',
  'hers',
  'herself',
  'it',
  'its',
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  'thatll',
  't

### Results
|Tuning|Best Score|Train Score|Test Score|f1 Score|Sensitivity|Specificity|
|---|---|---|---|---|---|---|
|Before|0.737|0.980|0.736|0.74|0.69|0.78|
|1st Tuning|0.760|0.970|0.763|0.76|0.70|0.83|
|2nd Tuning|0.754|0.959|0.755|0.75|0.69|0.82|

The scores dropped slightly. The model from the first tuning is preferred.

In [44]:
# first attempt at tuning logreg tvec model
logreg_tvec_tuned = set_model('logreg', 'tvec', 
                              max_df=[.2,.8],
                              max_features=[None, 3000, 5000],
                              min_df=[1,2],
                              ngram_range=[(1,1),(1,2)],
                              stopwords=[['japanese', 'korean'], 
                                         get_stopwords('eng_jk'), 
                                         get_stopwords('common_eng_jk')], 
                              algo_params={
                                  'logreg__C': [.2,.8],
                                  'logreg__max_iter': [100, 200],
                                  'logreg__penalty': ['l1','l2']
                              })
eval_model(logreg_tvec_tuned)

Best Score: 0.7645327530920751
Train Score: 0.9201623815967523
Test Score: 0.7606490872210954
              precision    recall  f1-score   support

           0       0.73      0.81      0.77       244
           1       0.79      0.71      0.75       249

    accuracy                           0.76       493
   macro avg       0.76      0.76      0.76       493
weighted avg       0.76      0.76      0.76       493



In [45]:
logreg_tvec_tuned.best_params_

{'logreg__C': 0.8,
 'logreg__max_iter': 100,
 'logreg__penalty': 'l2',
 'tvec__max_df': 0.2,
 'tvec__max_features': 5000,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__preprocessor': None,
 'tvec__stop_words': ['japanese', 'korean']}

### Results
|Tuning|Best Score|Train Score|Test Score|f1 Score|Sensitivity|Specificity|
|---|---|---|---|---|---|---|
|Before|0.746|0.922|0.769|0.77|0.71|0.82|
|1st Tuning|0.765|0.920|0.761|0.76|0.71|0.81|

The Best Score is higher but all other scores got worse, including more overfitting. The tuning of this model is abandoned.

In [46]:
# first attempt at tuning forest cvec model
forest_cvec_tuned = set_model('forest', 'cvec', 
                              max_df=[.6,.8,1],
                              max_features=[None,3000],
                              min_df=[1,2],
                              ngram_range=[(1,1),(1,2)],
                              stopwords=[['japanese', 'korean'], 
                                         get_stopwords('common_eng_jk')], 
                              algo_params={
                                  'forest__n_estimators': [100,200],
                                  'forest__max_depth': [None,5,10],
                                  'forest__min_samples_split': [2,3],
                                  'forest__ccp_alpha': [0,.1],
                                  'forest__n_jobs': [-1],
#                                   'forest__verbose': [1]
                              })
eval_model(forest_cvec_tuned)

960 fits failed out of a total of 5760.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
960 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/family/opt/anaconda3/envs/nov20/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/family/opt/anaconda3/envs/nov20/lib/python3.8/site-packages/sklearn/pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/Users/family/opt/anaconda3/envs/nov20/lib/python3.8/site-packages/sklearn/pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/family/opt/anaconda3/envs/nov20/lib/python3.8

Best Score: 0.7598007329363261
Train Score: 0.9851150202976996
Test Score: 0.7525354969574036
              precision    recall  f1-score   support

           0       0.74      0.77      0.76       244
           1       0.77      0.73      0.75       249

    accuracy                           0.75       493
   macro avg       0.75      0.75      0.75       493
weighted avg       0.75      0.75      0.75       493



In [47]:
forest_cvec_tuned.best_params_

{'cvec__max_df': 0.6,
 'cvec__max_features': 3000,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 2),
 'cvec__preprocessor': None,
 'cvec__stop_words': ['word',
  'understand',
  'good',
  'also',
  'lot',
  'used',
  'learning',
  'much',
  'something',
  'english',
  'anyone',
  'first',
  'ive',
  'learn',
  'know',
  'way',
  'words',
  'sentence',
  'like',
  'people',
  'help',
  'dont',
  'get',
  'grammar',
  'really',
  'read',
  'find',
  'time',
  'want',
  'use',
  'language',
  'would',
  'think',
  'im',
  'study',
  'one',
  'studying',
  'i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  'youre',
  'youve',
  'youll',
  'youd',
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  'shes',
  'her',
  'hers',
  'herself',
  'it',
  'its',
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  'thatll',
  't

### Results
|Tuning|Best Score|Train Score|Test Score|f1 Score|Sensitivity|Specificity|
|---|---|---|---|---|---|---|
|Before|0.719|0.995|0.734|0.73|0.66|0.81|
|1st Tuning|0.760|0.985|0.753|0.75|0.73|0.77|

The scores improved and the overfit has been reduced.

The best parameters of min_df=1, stopwords=common_eng_jk, ccp_alpha=0, max_depth=None, min_samples_split=2 are retained and the rest undergo a second set of tuning.

In [54]:
# second attempt at tuning forest cvec model
forest_cvec_tuned_2 = set_model('forest', 'cvec', 
                                max_df=[.2,.4,.6],
                                max_features=[3000,5000],
                                ngram_range=[(1,2),(1,3),(2,3)],
                                stopwords=[get_stopwords('common_eng_jk')], 
                                algo_params={
                                    'forest__n_estimators': [200,300],
                                    'forest__n_jobs': [-1],
#                                     'forest__verbose': [1]
                                })
eval_model(forest_cvec_tuned_2)

Best Score: 0.7598007329363261
Train Score: 0.9851150202976996
Test Score: 0.7525354969574036
              precision    recall  f1-score   support

           0       0.74      0.77      0.76       244
           1       0.77      0.73      0.75       249

    accuracy                           0.75       493
   macro avg       0.75      0.75      0.75       493
weighted avg       0.75      0.75      0.75       493



In [55]:
forest_cvec_tuned_2.best_params_

{'cvec__max_df': 0.2,
 'cvec__max_features': 3000,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 2),
 'cvec__preprocessor': None,
 'cvec__stop_words': ['word',
  'understand',
  'good',
  'also',
  'lot',
  'used',
  'learning',
  'much',
  'something',
  'english',
  'anyone',
  'first',
  'ive',
  'learn',
  'know',
  'way',
  'words',
  'sentence',
  'like',
  'people',
  'help',
  'dont',
  'get',
  'grammar',
  'really',
  'read',
  'find',
  'time',
  'want',
  'use',
  'language',
  'would',
  'think',
  'im',
  'study',
  'one',
  'studying',
  'i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  'youre',
  'youve',
  'youll',
  'youd',
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  'shes',
  'her',
  'hers',
  'herself',
  'it',
  'its',
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  'thatll',
  't

### Results
The parameters chosen did not change, so the results are identical to the previous set.

In [48]:
# first attempt at tuning forest tvec model
forest_tvec_tuned = set_model('forest', 'tvec', 
                              max_df=[.6,.8,1],
                              max_features=[None,5000],
                              min_df=[1,2],
                              ngram_range=[(1,1),(1,2)],
                              stopwords=[['japanese', 'korean'], 
                                         get_stopwords('common_eng_jk')], 
                              algo_params={
                                  'forest__n_estimators': [100,200],
                                  'forest__max_depth': [None,5,10],
                                  'forest__min_samples_split': [2,3],
                                  'forest__ccp_alpha': [0,.1],
                                  'forest__n_jobs': [-1],
#                                   'forest__verbose': [1]
                              })
eval_model(forest_tvec_tuned)

960 fits failed out of a total of 5760.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
960 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/family/opt/anaconda3/envs/nov20/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/family/opt/anaconda3/envs/nov20/lib/python3.8/site-packages/sklearn/pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/Users/family/opt/anaconda3/envs/nov20/lib/python3.8/site-packages/sklearn/pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/family/opt/anaconda3/envs/nov20/lib/python3.8

Best Score: 0.7645442052221714
Train Score: 0.993234100135318
Test Score: 0.7261663286004056
              precision    recall  f1-score   support

           0       0.70      0.79      0.74       244
           1       0.76      0.67      0.71       249

    accuracy                           0.73       493
   macro avg       0.73      0.73      0.73       493
weighted avg       0.73      0.73      0.73       493



In [49]:
forest_tvec_tuned.best_params_

{'forest__ccp_alpha': 0,
 'forest__max_depth': None,
 'forest__min_samples_split': 3,
 'forest__n_estimators': 100,
 'forest__n_jobs': -1,
 'tvec__max_df': 0.6,
 'tvec__max_features': None,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__preprocessor': None,
 'tvec__stop_words': ['word',
  'understand',
  'good',
  'also',
  'lot',
  'used',
  'learning',
  'much',
  'something',
  'english',
  'anyone',
  'first',
  'ive',
  'learn',
  'know',
  'way',
  'words',
  'sentence',
  'like',
  'people',
  'help',
  'dont',
  'get',
  'grammar',
  'really',
  'read',
  'find',
  'time',
  'want',
  'use',
  'language',
  'would',
  'think',
  'im',
  'study',
  'one',
  'studying',
  'i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  'youre',
  'youve',
  'youll',
  'youd',
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  'shes',
  'her',
  'hers',
  'herself',
  'it',
  'its',
  'its',
  'itsel

### Results
|Tuning|Best Score|Train Score|Test Score|f1 Score|Sensitivity|Specificity|
|---|---|---|---|---|---|---|
|Before|0.718|0.995|0.720|0.72|0.68|0.76|
|1st Tuning|0.765|0.993|0.726|0.73|0.67|0.79|

The scores improved and the overfit has been slightly reduced.

The best parameters of max_features=None, min_df=1, ngram_range=(1,1), stopwords=common_eng_jk, ccp_alpha=0, max_depth=None, n_estimators=100 are retained and the rest undergo a second set of tuning.

In [57]:
# second attempt at tuning forest tvec model
forest_tvec_tuned_2 = set_model('forest', 'tvec', 
                                max_df=[.2,.4,.6],
                                stopwords=[get_stopwords('common_eng_jk')], 
                                algo_params={
                                    'forest__min_samples_split': [3,5,7],
                                    'forest__n_jobs': [-1],
#                                     'forest__verbose': [1]
                                })
eval_model(forest_tvec_tuned_2)

Best Score: 0.7645442052221714
Train Score: 0.993234100135318
Test Score: 0.7261663286004056
              precision    recall  f1-score   support

           0       0.70      0.79      0.74       244
           1       0.76      0.67      0.71       249

    accuracy                           0.73       493
   macro avg       0.73      0.73      0.73       493
weighted avg       0.73      0.73      0.73       493



In [58]:
forest_tvec_tuned_2.best_params_

{'forest__min_samples_split': 3,
 'forest__n_jobs': -1,
 'tvec__max_df': 0.2,
 'tvec__max_features': None,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__preprocessor': None,
 'tvec__stop_words': ['word',
  'understand',
  'good',
  'also',
  'lot',
  'used',
  'learning',
  'much',
  'something',
  'english',
  'anyone',
  'first',
  'ive',
  'learn',
  'know',
  'way',
  'words',
  'sentence',
  'like',
  'people',
  'help',
  'dont',
  'get',
  'grammar',
  'really',
  'read',
  'find',
  'time',
  'want',
  'use',
  'language',
  'would',
  'think',
  'im',
  'study',
  'one',
  'studying',
  'i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  'youre',
  'youve',
  'youll',
  'youd',
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  'shes',
  'her',
  'hers',
  'herself',
  'it',
  'its',
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',

### Results
The best parameter for max_df changed from 0.6 to 0.2 but the scores remained the same.

## Summary of Results
|Algorithm|Vectorizer|Best Score|Train Score|Test Score|f1 Score|Sensitivity|Specificity|
|---|---|---|---|---|---|---|---|
|Multinomial Naive-Bayes|CountVectorizer|0.763|0.894|0.793|0.79|0.82|0.77|
|Multinomial Naive-Bayes|TfidfVectorizer|0.761|0.946|0.785|0.78|0.82|0.75|
|Logistic Regression|CountVectorizer|0.760|0.970|0.763|0.76|0.70|0.83|
|Logistic Regression|TfidfVectorizer|0.746|0.922|0.769|0.77|0.71|0.82|
|Random Forest Classifier|CountVectorizer|0.760|0.985|0.753|0.75|0.73|0.77|
|Random Forest Classifier|CountVectorizer|0.765|0.993|0.726|0.73|0.67|0.79|

The scores are quite similar for all models.

## Important Features

In [56]:
# for Multinomial Naive-Bayes CountVectorizer
get_coef(nb_cvec_tuned_2)



Unnamed: 0,coef
sound natural,-10.599038
feel comfortable,-10.599038
feel better someone,-10.599038
feel better,-10.599038
speak speak,-10.599038
speak speak speak,-10.599038
feed,-10.599038
speaker appreciate,-10.599038
speaker appreciate idk,-10.599038
speaker saying,-10.599038


Unnamed: 0,coef
japan,-5.752884
looking,-5.743109
start,-5.668168
make,-5.650278
anki,-5.632703
using,-5.573514
n,-5.565338
reading,-5.44286
genki,-5.340241
kanji,-4.428382


In [59]:
# for Multinomial Naive-Bayes TfidfVectorizer
get_coef(nb_tvec_tuned_2)



Unnamed: 0,coef
listening reading,-9.543967
someone talk,-9.543967
someone says,-9.543967
developer,-9.543967
developers,-9.543967
someone pronounce,-9.543967
someone name,-9.543967
dict,-9.543967
someone journey,-9.543967
someone interested,-9.543967


Unnamed: 0,coef
app,-6.40904
katakana,-6.336903
question,-6.312437
reading,-6.297708
hiragana,-6.281219
anki,-6.248949
start,-6.14195
n,-6.132406
genki,-6.002521
kanji,-5.180831


For Multinomial Naive-Bayes, only the important features for classifying a post as Japanese can be picked out from the words with less negative coefficients. For example, in the TfidfVectorizer model, the writing systems 'katakana', 'hiragana' and 'kanji', the commonly used flashcard app 'anki' and the commonly used textbook 'genki'. The other words like 'app', 'question', 'reading', 'start' and 'n' have no specific link to the Japanese language and are just as likely to appear in the Korean subreddit. It is likely that the model overtrained on these features. The top ten features picked up by the CountVectorizer model are even less specific to Japanese.

In [60]:
get_coef(logreg_cvec_tuned)

Unnamed: 0,coef
korea,-1.634645
ttmik,-1.262089
hangul,-1.11633
hanja,-0.852709
interchangeable,-0.815994
intermediate,-0.777432
friends,-0.767315
topik,-0.748029
exchange,-0.738193
kpop,-0.73195


Unnamed: 0,coef
nihongo,0.963521
app,0.992204
jlpt,1.035425
katakana,1.07391
anime,1.179843
hiragana,1.239954
n,1.272942
genki,1.674153
japan,1.698325
kanji,2.385682


In [61]:
get_coef(logreg_tvec_jk)

Unnamed: 0,coef
korea,-1.935336
ttmik,-1.715643
name,-1.373717
you,-1.272829
hanja,-1.155859
hangul,-1.15313
topik,-1.095048
intermediate,-1.090534
someone,-1.055295
talk,-1.046582


Unnamed: 0,coef
through,1.351978
anime,1.377047
anki,1.484723
app,1.49771
katakana,1.864233
hiragana,1.933414
japan,2.039729
n,2.367527
genki,2.477423
kanji,4.604094


For Logistic Regression, the more negative coefficients are words linked to Korean while the more positive coefficients are words linked to Japanese. For example, there are words related to the writing systems ('hangul' and 'hanja' for Korean and 'katakana', 'hiragana' and 'kanji' for Japanese), words related to the popular cultures ('kpop' for Korean and 'anime' for Japanese) and the proficiency tests ('topik' for Korean and 'jlpt' for Japanese). The words used for classifying which subreddit the posts belong to appear to be more correctly picked out, giving more confidence in the predictive powers of the models.

In [64]:
get_feature_impt(forest_cvec_tuned)

Unnamed: 0,0,1
1325,kanji,0.047046
985,genki,0.018677
1287,japan,0.014082
1135,hiragana,0.012954
1685,n,0.012795
1391,korea,0.011741
1345,katakana,0.009625
108,anime,0.009507
109,anki,0.008314
2447,start,0.007788


In [65]:
get_feature_impt(forest_tvec_tuned)

Unnamed: 0,0,1
3361,kanji,0.046214
2515,genki,0.016979
2825,hiragana,0.013641
3245,japan,0.01172
4128,n,0.00997
298,anki,0.008962
5966,start,0.008296
3486,korea,0.008064
3381,katakana,0.006966
340,app,0.006885


For Random Forest Classifier, the important features are similar to the ones picked out by Logistic Regression, but it cannot be determined which subreddit the word is linked to.

## Test Models on New Data
To check for future compatibility, one model for each of the 3 algorithms is selected and trained using the original full set of data (from 1 Jan 2021 to 2 Feb 2021) for use on future posts, to ensure that the models are not adversely affected by recent trends. They are tested on 100 newest posts (retrieved on 20 Jan 2022) from each subreddit, and their performances are evaluated.

In [112]:
X_train = X
y_train = y

In [113]:
# nb_cvec_tuned_2, logreg_tvec_tuned and forest_cvec_tuned are chosen
nb_chosen = set_model('nb', 'cvec', 
                      max_df=[.2],
                      max_features=[5000],
                      ngram_range=[(1,3)],
                      stopwords=[get_stopwords('common_eng_jk')], 
                      algo_params={
                          'nb__alpha': [.8]
                      })

logreg_chosen = set_model('logreg', 'tvec', 
                          max_df=[.2],
                          max_features=[5000],
                          min_df=[1],
                          ngram_range=[(1,1)],
                          stopwords=[['japanese', 'korean']], 
                          algo_params={
                              'logreg__C': [.8],
                              'logreg__max_iter': [100],
                              'logreg__penalty': ['l2']
                          })

forest_chosen = set_model('forest', 'cvec', 
                          max_df=[.6],
                          max_features=[3000],
                          min_df=[1],
                          ngram_range=[(1,2)],
                          stopwords=[get_stopwords('common_eng_jk')], 
                          algo_params={
                              'forest__n_estimators': [200],
                              'forest__max_depth': [None],
                              'forest__min_samples_split': [2],
                              'forest__ccp_alpha': [0],
                              'forest__n_jobs': [-1],
                          })

In [114]:
# fit on full dataset
nb_chosen.fit(X,y)
logreg_chosen.fit(X,y)
forest_chosen.fit(X,y)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(token_pattern='\\w+')),
                                       ('forest',
                                        RandomForestClassifier(random_state=42))]),
             param_grid={'cvec__max_df': [0.6], 'cvec__max_features': [3000],
                         'cvec__min_df': [1], 'cvec__ngram_range': [(1, 2)],
                         'cvec__preprocessor': [None],
                         'cvec__stop_words': [['word', 'understand', 'good',
                                               'also', 'lot', 'used',
                                               'learning', 'much', 'something',
                                               'english', 'anyone', 'first',
                                               'ive', 'learn', 'know', 'way',
                                               'words', 'sentence', 'like',
                                               'peo

In [66]:
# function to scrape the web and put into 'subreddit' and 'post' into DataFrame
url = "https://api.pushshift.io/reddit/search/submission"

def get_new_posts(subreddit):
    # scrape the web
    params = {
        'subreddit': subreddit,
        'size': 100
    }
    res = requests.get(url, params)
    df = pd.DataFrame(res.json()['data'])[['subreddit', 'selftext', 'title']]
    
    # create feature 'post'
    df['post'] = df['title']+' '+df['selftext']
    
    # remove urls, '[removed]', numbers and punctuations, change to lower case
    df['post'] = df['post'].map(lambda x: re.sub(r'\[removed\]', '', re.sub(r'http\S+', '', x)))
    df['post'] = df['post'].map(lambda x: re.sub(r'[^\w\s]|\d', '', x))
    df['post'] = df['post'].map(lambda x: x.lower())
    df['post'] = df['post'].map(lambda x: re.sub(r"[^\x00-\x7F]", '', x))
    
    # drop duplicates
    df.drop_duplicates(subset='post', inplace=True)

    new_df = df[['subreddit','post']]
    return new_df

In [69]:
# get new posts
j_new = get_new_posts('LearnJapanese')
k_new = get_new_posts('korean')

In [77]:
# function to calculate sensitivity and specificity, and return dataframe of wrongly classified posts
def get_wrong_preds(model):
    j_preds = model.predict(j_new['post'])
    k_preds = model.predict(k_new['post'])
    print(f"Percentage of Japanese posts predicted correctly: {round(np.mean(j_preds),3)}")
    print(f"Percentage of Korean posts predicted correctly: {round(1 - np.mean(k_preds),3)}")
    j_wrong = j_new[~np.array(j_preds, dtype=bool)]
    k_wrong = k_new[np.array(k_preds, dtype=bool)]
    return pd.concat([j_wrong, k_wrong])

In [115]:
# for Multinomial Naive-Bayes
wrong_preds_nb = get_wrong_preds(nb_chosen)

Percentage of Japanese posts predicted correctly: 0.794
Percentage of Korean posts predicted correctly: 0.796


In [116]:
# for Logistic Regression
wrong_preds_logreg = get_wrong_preds(logreg_chosen)

Percentage of Japanese posts predicted correctly: 0.691
Percentage of Korean posts predicted correctly: 0.786


In [117]:
# for Random Forest Classifier
wrong_preds_forest = get_wrong_preds(forest_chosen)

Percentage of Japanese posts predicted correctly: 0.763
Percentage of Korean posts predicted correctly: 0.776


Multinomial Naive-Bayes performs slightly better than Random Forest Classifier, while Logistic Regression performs poorly on sensitivity with only 69% of the Japanese posts correctly classified.

The wrongly predicted Japanese posts are investigated and compared below.

In [118]:
# get indices of wrongly predicted Japanese posts
wrong_nb_ind = set(wrong_preds_nb[wrong_preds_nb['subreddit']=='LearnJapanese'].index)
wrong_logreg_ind = set(wrong_preds_logreg[wrong_preds_logreg['subreddit']=='LearnJapanese'].index)
wrong_forest_ind = set(wrong_preds_forest[wrong_preds_forest['subreddit']=='LearnJapanese'].index)

print(" Indices of wrong predictions in Japanese subreddit ".center(80, '='))
print(f"nb: {wrong_nb_ind}")
print(f"logreg: {wrong_logreg_ind}")
print(f"forest: {wrong_forest_ind}")

nb: {3, 5, 6, 8, 21, 22, 24, 27, 28, 29, 36, 52, 54, 55, 58, 59, 63, 69, 72, 91}
logreg: {2, 5, 8, 19, 21, 22, 24, 27, 28, 33, 36, 37, 38, 39, 52, 54, 55, 58, 59, 61, 62, 63, 68, 69, 72, 75, 83, 84, 86, 91}
forest: {2, 4, 5, 16, 19, 21, 24, 27, 28, 33, 36, 38, 54, 55, 58, 61, 62, 63, 64, 69, 71, 83, 84}


In [119]:
# display Japanese posts wrongly classified by all 3 models
wrong_common = wrong_nb_ind.intersection(wrong_logreg_ind.intersection(wrong_forest_ind))
j_new[j_new.index.isin(wrong_common)]

Unnamed: 0,subreddit,post
5,LearnJapanese,if im talking to a stranger and they make me mad do i still have to use form with them
21,LearnJapanese,what does this tattoo mean
24,LearnJapanese,classical vs modern japanese i was wondering if anyone knew the difference between modern and classical japanese if you were fluent in modern japanese could you understand classical or are they too different if theyre different how do peoplehistorians learn classical
27,LearnJapanese,what are some online chatrooms or discords dedicated to japanese learning ive gone on some of the japanese discords but theyre either memes and no place to actually ask questions or weird chat setups or only voice channels\n\nare there any solid places to talk to communicate with other people that are learning japanese\n\notherwise id have to ask my japanese friend about questions a day can i say to mean i didnt eat\n\nitd get pretty old
28,LearnJapanese,how do i say thank you for your love at me
36,LearnJapanese,how do you say thank you with a name is it thank you akasuki or\n\nakasuki thank you
54,LearnJapanese,can represent both a change of state and action in progress depending on context i was thinking of how little i understood about the form of verbs yesterday specifically i dont understand weather representing an action in progress or a change of state depends on the verb for example can mean either depending on weather your emphasizing the object or the subject as in\n\nampxb\n\n\n\nim making dinner\n\n\n\ndinner is made\n\nampxb\n\nor is this incorrect and can only take one of the prior two meanings
55,LearnJapanese,is it normal to be able to translate from japanese to english but not english to japanese will say im just started learning again and i am at chaoter of jfz when ever i have to translate english to japanese i find it really hard but not translating japanese to english
58,LearnJapanese,what does this say i got this sword that has some japanese letters on it id appreciate if someone could translate this for me \n\nthank you so much
63,LearnJapanese,are there any chill discord servers i can join to practice my japanese with other newcomers that isnt competitive about learning title pretty much says it all as a native english speaker slowly learning japanese i just dont know where to look thank you


Reading the actual Japanese posts that are misclassified as Korean, it is not surprising that these posts get misclassified (after the keyword 'japanese' is removed) because the contents are very generic (e.g. 'what does this tattoo mean'). It is unlikely that the models that be improved much further.

In [120]:
# probability of intersecting set of wrongly classified Japanese posts being predicted as Korean
nb_chosen.predict_proba(j_new[j_new.index.isin(wrong_common)]['post'])[:,0]

array([0.58742245, 0.77372577, 0.99644074, 0.97754915, 0.78845191,
       0.96371974, 0.99999759, 0.94299875, 0.99993798, 0.77869058,
       0.99978235])

In [121]:
logreg_chosen.predict_proba(j_new[j_new.index.isin(wrong_common)]['post'])[:,0]

array([0.51831852, 0.69835538, 0.54165618, 0.54345802, 0.69217949,
       0.84736174, 0.58750818, 0.64775623, 0.82371365, 0.58802708,
       0.55167248])

In [122]:
forest_chosen.predict_proba(j_new[j_new.index.isin(wrong_common)]['post'])[:,0]

array([0.63971428, 0.68842689, 0.61691613, 0.56976247, 0.74591667,
       0.93916667, 0.51574786, 0.76725   , 0.92416667, 0.775     ,
       0.56876374])

Multinomial Naive-Bayes does worse than the other 2 models as it assigned very high probabilities of the post being Korean to the common set of misclassified Japanese posts.

## Conclusion
There are sufficient differences between the 2 subreddits as 2 of the 3 trained models (Multinomial Naive-Bayes and Random Forest Classifier) accurately predict which subreddit the posts come from for more than 75% of the time. The Random Forest Classifier is chosen to be the production model as it did not assign very high probabilities to the misclassified posts.

## Possible Improvements in Future
1. This project used only 1000 posts from each subreddit. To improve the model, more posts can be scraped from the web so that more data could be used for training.
2. More hyperparameter tuning could be attempted to improve the models if there was more time.
3. The results of the 3 models could be combined using Voting Classifier.

In [111]:
# save j_new and k_new to csv for reference
j_new.to_csv('datasets/j_new.csv')
k_new.to_csv('datasets/k_new.csv')