In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
import re
from data_cleaning import *

# 1. Preprocessing

In [2]:
# read data
train_data = pd.read_csv("Train.csv", sep=',')
test_data = pd.read_csv("Test.csv", sep=',')

In [3]:
#separating instance and label for Train
X_train_raw = [x[0] for x in train_data[['text']].values]
Y_train = [x[0] for x in train_data[['sentiment']].values]
X_test_raw = [x[0] for x in test_data[['text']].values]

#### (1). data cleaning

In [4]:
# 1. data cleaning (optional)
X_train_need_to_clean = pd.DataFrame(X_train_raw)
X_test_need_to_clean = pd.DataFrame(X_test_raw)

# remove url, # and @
X_train_need_to_clean.replace("\b*https?:\S*", '', regex=True, inplace=True)
X_train_need_to_clean.replace("\b*@\S*", '', regex=True, inplace=True)
X_train_need_to_clean.replace("\b*#\S*", '', regex=True, inplace=True)
X_test_need_to_clean.replace("\b*https?:\S*", '', regex=True, inplace=True)
X_test_need_to_clean.replace("\b*@\S*", '', regex=True, inplace=True)
X_test_need_to_clean.replace("\b*#\S*", '', regex=True, inplace=True)

for i in range(X_train_need_to_clean.shape[0]):
    X_train_need_to_clean.loc[i, 0] = ' '.join(text_preprocessing(X_train_need_to_clean.loc[i, 0], remove_html=False))

X_train_clean = [x[0] for x in X_train_need_to_clean[[0]].values]
X_test_clean = [x[0] for x in X_test_need_to_clean[[0]].values]

In [12]:
pd.set_option("display.max_rows",None)
pd.set_option("display.max_columns",None)
X_train_need_to_clean

Unnamed: 0,0
0,doctor hit campaign trail race medical council...
1,anybody go radio station tomorrow shawn friend...
2,find naruto not 5th hokage
3,prince george reservist die saturday want help...
4,season sun versi nirvana rancak gak slow rockkk
5,not sun lady gaga
6,cute
7,today international day elimination violence w...
8,game april david wright go for-5 hr bb r monda...
9,josh hamilton fly center go 9th tie


#### (2). vectorization (transformation)

In [6]:


# bag of words
# countvectorizer
BoW_vectorizer = CountVectorizer(analyzer='word', ngram_range=(2,2))
X_train_BoW = BoW_vectorizer.fit_transform(X_train_clean)
X_test_BoW = BoW_vectorizer.transform(X_test_clean)

In [7]:
# 3. feature selection
X_train_new = SelectKBest(chi2,k=5000).fit_transform(X_train_BoW,Y_train)


In [8]:
# 4. split
train_size = X_train_BoW.shape[0]
test_size = X_test_BoW.shape[0]
## random hold out
ts = test_size/train_size
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_train_new,Y_train, test_size=ts)

# modelling

## base model: 0R

In [9]:
clf = DummyClassifier(strategy='most_frequent')
basemodel = clf.fit(X_train_raw, Y_train)
print("base model score: ", basemodel.score(X_train_raw, Y_train))

base model score:  0.5806348041464086


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

## logistic regression

### choose of hyperparameter

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

hyper = {
    'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
    'penalty': ['l1', 'l2', 'none', 'elasticnet'],
    'C': [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000],
    'max_iter': [100, 200, 300],
    'multi_class': ['auto', 'ovr', 'multinomial']
}

##### randomised method (less computation)

In [16]:
search_logi = RandomizedSearchCV(LogisticRegression(),hyper, scoring='accuracy', cv=5, n_iter=5)

##### grid search

In [None]:
search_logi = GridSearchCV(LogisticRegression(),hyper, scoring='accuracy', cv=5)

##### evaluation

In [17]:
logi_result = search_logi.fit(X_train_s, y_train_s)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
pd.DataFrame(logi_result.cv_results_)
logi_result.best_params_
logi_result.best_score_

0.6752853263900296

In [22]:
logi_result.best_params_['solver']

'lbfgs'

### SVM

In [None]:
from sklearn.svm import SVC

print(f"svm model score: ", svm_model.score(X_test_s,y_test_s))
svm_hyper = {
    'degree': [3, 5, 10, 15],
    'gamma': [1,0.1,0.01,0.001],
    'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000],
    'kernel': ['linear', 'poly', 'rbf', 'sigmod'],
    'max_iter': [-1, 100, 500, 1000],
    'decision_function_shape': ['ovo', 'ovr']
}

In [None]:
# random method
search_svm = RandomizedSearchCV(SVC(), svm_hyper, scoring='accuracy', cv=5, n_iter=100)

In [None]:
# grid method
search_svm = GridSearchCV(SVC(), svm_hyper, scoring='accuracy', cv=5)

In [None]:
svm_result = search_svm.fit(X_train_s, y_train_s)
pd.DataFrame(svm_result.cv_results_)
svm_result.best_params_
svm_result.best_score_

### Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

rf_hyper = {
    'n_estimators': [90, 100, 115 , 130],
    'criterion': ['gini', 'entropy'],
    'max_depth': range(2,20,1),
    'min_sample_leaf': range(1,10,1),
    'min_samples_split': range(2,10,1),
    'max_features': ['auto', 'log2']
}

In [None]:
# random method
search_svm = RandomizedSearchCV(RandomForestClassifier(), rf_hyper, scoring='accuracy', cv=5, n_iter=100)

In [None]:
# grid method 
search_svm = GridSearchCV(RandomForestClassifier(), rf_hyper, scoring='accuracy', cv=5)

In [None]:
rf_result = search_svm.fit(X_train_s, y_train_s)
pd.DataFrame(rf_result.cv_results_)
rf_result.best_params_
rf_result.best_score_

### Stacking

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier

rf_model = RandomForestClassifier(criterion=rf_result.best_params_['criterion'], max_depth=rf_result.best_params_['max_depth'],
                                  max_features='log2', min_samples_leaf=rf_result.best_params_['min_sample_leaf'], 
                                  min_samples_split=rf_result.best_params_['min_samples_split'], n_estimators=rf_result.best_params_['n_estimators'],
                                  random_state=0).fit(X_train_s, y_train_s)

svm_model = SVC(degree=svm_result.best_params_['degree'], gamma=svm_result.best_params_['gamma'] ,C=svm_result.best_params_['C'], 
                kernel=svm_result.best_params_['kernel'], max_iter=svm_result.best_params_['max_iter'], 
                decision_function_shape=svm_result.best_params_['decision_function_shape']).fit(X_train_s, y_train_s)

logi_model = LogisticRegression(solver=logi_result.best_params_['solver'], penalty=logi_result.best_params_['penalty'],
                                C=logi_result.best_params_['C'], max_iter=logi_result.best_params_['max_iter'],
                                multi_class=logi_result.best_params_['multi_class']).fit(X_train_s, y_train_s)

estimators = [('rf', rf_model),('svr', svm_model), ('log', logi_model)]

In [None]:
dt_stacking = StackingClassifier(estimator=estimators, final_estimator=DecisionTreeClassifier())

In [None]:
svm_stacking = StackingClassifier(estimator=estimators, final_estimator=SVC())