# 

In [2]:
!pip install comet_ml
!pip install tweet-preprocessor
!pip install nlppreprocess

Collecting comet_ml
  Downloading comet_ml-3.23.0-py2.py3-none-any.whl (308 kB)
[?25l[K     |█                               | 10 kB 18.0 MB/s eta 0:00:01[K     |██▏                             | 20 kB 23.1 MB/s eta 0:00:01[K     |███▏                            | 30 kB 26.7 MB/s eta 0:00:01[K     |████▎                           | 40 kB 28.3 MB/s eta 0:00:01[K     |█████▎                          | 51 kB 30.0 MB/s eta 0:00:01[K     |██████▍                         | 61 kB 30.9 MB/s eta 0:00:01[K     |███████▌                        | 71 kB 25.2 MB/s eta 0:00:01[K     |████████▌                       | 81 kB 26.1 MB/s eta 0:00:01[K     |█████████▋                      | 92 kB 25.3 MB/s eta 0:00:01[K     |██████████▋                     | 102 kB 26.2 MB/s eta 0:00:01[K     |███████████▊                    | 112 kB 26.2 MB/s eta 0:00:01[K     |████████████▊                   | 122 kB 26.2 MB/s eta 0:00:01[K     |█████████████▉                  | 133 kB 26.2 MB/

In [3]:
#Data preprocessing libraries
import numpy as np
import pandas as pd

#Data Visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [4]:
data = pd.read_csv('sample_data/train.csv', encoding='UTF-8')
test = pd.read_csv('sample_data/test_with_no_labels.csv' ,encoding='UTF-8')

#Insurance Dataset
data_copy = data.copy()

In [5]:
import preprocessor as p

In [6]:
#Clean data using tweet preprocessor
def _clean_tweet(data):
  return p.clean(data)
#Apply the function to the dataset
data_copy['clean_tweets'] = data_copy['message'].apply(_clean_tweet)

In [7]:
import re
import string

In [8]:
def _remove_punctuation_numbers(data):
   punc_numbers = string.punctuation + '0123456789'
   return ''.join([l for l in data if l not in punc_numbers])
data_copy['clean_punc'] = data_copy['clean_tweets'].apply(_remove_punctuation_numbers)

In [9]:
def _characters (data):
  data = re.sub('\\n' , '' , data) #Remove any new lines
  data = re.sub(r'[^\x00-\x7f]',r'', data)
  return data
data_copy['clean_char'] = data_copy['clean_punc'].apply(_characters)


In [10]:
def _lower(data):
  return data.lower()
data_copy['lower'] = data_copy['clean_punc'].apply(_lower)

In [11]:
import nltk
nltk.download('stopwords')
stopword = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
from nlppreprocess import NLP
nlp = NLP()
nlp.process('couldnt')

'could not'

In [13]:
#Remove Stopwords
def remove_stopwords(data):
    """
    Remove stop-word in the dataset to reduce noise
    Args:
        Args:
        data: pandas dataframe
    Return:
        Dataframe:non-stop word
    """
    stopwords = NLP(replace_words=True, remove_stopwords=True, 
                            remove_numbers=True, remove_punctuations=False) 
    data = stopwords.process(data)
    return data
    
data_copy['Tweet_nonstop'] = data_copy['lower'].apply(lambda x: remove_stopwords(x))

In [14]:
#Tokenization
def tokenization(data):
    data = re.split('\W+', data)
    return data
data_copy['Tweet_tokenized'] = data_copy['Tweet_nonstop'].apply(lambda x: tokenization(x))

In [15]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
nltk.download('wordnet')
lem = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [16]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [17]:
def lemmatizer(data):
    pos = pos_tag(data)
    data = ' '.join([lem.lemmatize(word, po[0].lower()) 
                      if (po[0].lower() in ['n', 'r', 'v', 'a'] and word[0] != '@') else word for word, po in pos])
    return data
data_copy['lemmatized'] = data_copy['Tweet_tokenized'].apply(lambda x: lemmatizer(x))

In [18]:
def _analyzer (data):
    data  = _clean_tweet(data)
    data = _remove_punctuation_numbers(data)
    data = _characters(data)
    data = _lower(data)
    data = remove_stopwords(data)
    data = tokenization(data)
    data = lemmatizer(data)
    return data

In [74]:
# data['cleaned'] = data['message'].apply(lambda x: _analyzer(x))
# test['cleaned'] = test['message'].apply(lambda x: _analyzer(x))

In [19]:
data_copy.head()

Unnamed: 0,sentiment,message,tweetid,clean_tweets,clean_punc,clean_char,lower,Tweet_nonstop,Tweet_tokenized,lemmatized
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,PolySciMajor EPA chief doesn't think carbon di...,PolySciMajor EPA chief doesnt think carbon dio...,PolySciMajor EPA chief doesnt think carbon dio...,polyscimajor epa chief doesnt think carbon dio...,polyscimajor epa chief not think carbon dioxid...,"[polyscimajor, epa, chief, not, think, carbon,...",polyscimajor epa chief not think carbon dioxid...
1,1,It's not like we lack evidence of anthropogeni...,126103,It's not like we lack evidence of anthropogeni...,Its not like we lack evidence of anthropogenic...,Its not like we lack evidence of anthropogenic...,its not like we lack evidence of anthropogenic...,its not like we lack evidence anthropogenic gl...,"[its, not, like, we, lack, evidence, anthropog...",its not like we lack evidence anthropogenic gl...
2,2,RT @RawStory: Researchers say we have three ye...,698562,: Researchers say we have three years to act o...,Researchers say we have three years to act on...,Researchers say we have three years to act on...,researchers say we have three years to act on...,researchers say we three years act climate cha...,"[researchers, say, we, three, years, act, clim...",researcher say we three year act climate chang...
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736,WIRED : was a pivotal year in the war on clima...,WIRED was a pivotal year in the war on climat...,WIRED was a pivotal year in the war on climat...,wired was a pivotal year in the war on climat...,wired pivotal year in war climate change,"[wired, pivotal, year, in, war, climate, change]",wire pivotal year in war climate change
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954,": It's , and a racist, sexist, climate change ...",Its and a racist sexist climate change denyi...,Its and a racist sexist climate change denyi...,its and a racist sexist climate change denyi...,its and racist sexist climate change denying b...,"[its, and, racist, sexist, climate, change, de...",its and racist sexist climate change deny bigo...


In [None]:
# tfid_vec = TfidfVectorizer(use_idf=True,min_df= .01 , max_df=0.95 , ngram_range=(1, 2), analyzer='word')
# tfid_Vectorized = tfid_vec.fit_transform(data['cleaned']).toarray()

In [20]:
from sklearn.utils import resample
class_size = int(len(data[data['sentiment']==1])/2)

In [21]:
class_1 = data[data['sentiment']==-1]
class_2 = data[data['sentiment']==0]
class_3 = data[data['sentiment']==1]
class_4 = data[data['sentiment']==2]

In [22]:
# upsampling classes 1, 2, and 4 & downsampling class 3
class_1_up = resample(class_1,replace=True,n_samples=class_size, random_state=27)
class_2_up = resample(class_2,replace=True,n_samples=class_size, random_state=27)
class_4_up = resample(class_4,replace=True,n_samples=class_size, random_state=27)
class_3_down = resample(class_3,replace=False,n_samples=class_size, random_state=27)

In [23]:
# Creating a new DataFrame out of the balanced bata
resampled = pd.concat([class_1_up, class_2_up, class_4_up,class_3_down])

In [24]:
X = resampled['message'].apply(lambda x: _analyzer(x))
y = resampled['sentiment']

In [25]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X , y, stratify=y,
                                                       test_size =0.2, 
                                                       random_state=42)

In [26]:
# Models
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier , AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from sklearn import metrics
from sklearn.pipeline import Pipeline

In [27]:
classifiers = [LogisticRegression(random_state=42), SVC(kernel='poly', random_state = 42), 
               SVC(kernel = 'rbf', random_state = 42),MultinomialNB(),RidgeClassifier(),
               LinearSVC(random_state=42), SGDClassifier(random_state=42), RandomForestClassifier(random_state=42)]

In [28]:
def _performace_assesment(*args , **kwargs):
  model_stats = {}
  for clf in classifiers:
    model = Pipeline([('count_vec', TfidfVectorizer(lowercase = True, ngram_range=(1, 2), analyzer='word')),
                      ('clf' , clf)
                      ])
    
    model.fit(X_train, y_train) #Training
    model_pred = model.predict(X_test) #Testing

    # Dictionary of Models Performances
    model_stats[clf.__class__.__name__] = {
        'F1-Macro':metrics.f1_score(y_test, model_pred, average='macro'),
        'F1-Accuracy':metrics.f1_score(y_test, model_pred, average='micro'),
        'F1-Weighted':metrics.f1_score(y_test, model_pred, average='weighted')}
  return pd.DataFrame.from_dict(model_stats, orient='index')


In [92]:
performance = _performace_assesment(classifiers , X_train , X_test , y_train , y_test)
performance.to_csv('performance.csv')
dataframe = pd.read_csv('performance.csv', index_col = 0)
dataframe.sort_values('F1-Weighted', ascending=False)

Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted
SVC,0.876726,0.876905,0.876726
LinearSVC,0.857236,0.860199,0.857236
RidgeClassifier,0.853112,0.855803,0.853112
SGDClassifier,0.845635,0.849062,0.845635
LogisticRegression,0.834257,0.837632,0.834257
RandomForestClassifier,0.83399,0.838511,0.83399
MultinomialNB,0.807196,0.812134,0.807196


In [93]:
def _param_tuning(*args , **kwargs):
  best_params = {}

  for clf in classifiers:
    model = Pipeline([('count_vec', TfidfVectorizer(lowercase = True, ngram_range=(1, 2), analyzer='word')),
                      ('clf' , clf)
                      ])
    model.fit(X_train, y_train) #Training
    
    #Get models performing parameters
    params = model.get_params()
    model_name = clf.__class__.__name__ 
    model_name = {}
    for key in params:
      if key.startswith("clf"):
        if len(key) < 5:
          model_name['model'] = params[key]
        else:
            model_name[key[5:]] = params[key]
    best_params[clf.__class__.__name__] = model_name
  return best_params

In [94]:
best_params = _param_tuning(classifiers, X_train, y_train)

In [95]:
#Best parameters
best_params

{'LinearSVC': {'C': 1.0,
  'class_weight': None,
  'dual': True,
  'fit_intercept': True,
  'intercept_scaling': 1,
  'loss': 'squared_hinge',
  'max_iter': 1000,
  'model': LinearSVC(random_state=42),
  'multi_class': 'ovr',
  'penalty': 'l2',
  'random_state': 42,
  'tol': 0.0001,
  'verbose': 0},
 'LogisticRegression': {'C': 1.0,
  'class_weight': None,
  'dual': False,
  'fit_intercept': True,
  'intercept_scaling': 1,
  'l1_ratio': None,
  'max_iter': 100,
  'model': LogisticRegression(random_state=42),
  'multi_class': 'auto',
  'n_jobs': None,
  'penalty': 'l2',
  'random_state': 42,
  'solver': 'lbfgs',
  'tol': 0.0001,
  'verbose': 0,
  'warm_start': False},
 'MultinomialNB': {'alpha': 1.0,
  'class_prior': None,
  'fit_prior': True,
  'model': MultinomialNB()},
 'RandomForestClassifier': {'bootstrap': True,
  'ccp_alpha': 0.0,
  'class_weight': None,
  'criterion': 'gini',
  'max_depth': None,
  'max_features': 'auto',
  'max_leaf_nodes': None,
  'max_samples': None,
  'min_i

In [113]:
model = SVC(kernel='poly', random_state = 42)

In [30]:
Vectorize = TfidfVectorizer(lowercase = True, ngram_range=(1, 2), analyzer='word')
X_train = Vectorize.fit_transform(X_train)
X_test = Vectorize.transform(X_test)

In [115]:
stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True,
                                   random_state=42)

In [116]:
best_params[classifiers[2].__class__.__name__]

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'model': SVC(random_state=42),
 'probability': False,
 'random_state': 42,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [117]:
# param_grid = {
#     "average": [True, False],
#     "l1_ratio": np.linspace(0, 1, num=10),
#     "alpha": np.power(10, np.arange(-4, 1, dtype=float)),
# }

param_grid = {'kernel': ('linear', 'rbf'),'C': [1, 10, 100]}
grid_search = GridSearchCV(estimator= model,
                           param_grid=param_grid,
                           scoring='f1_weighted',
                           cv=stratified_kfold,
                           error_score=0,
                           n_jobs=-1)

In [118]:
grid_search.fit(X_train, y_train)
prediction = grid_search.predict(X_test)
cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)

In [112]:
grid_search.get_params().keys()

dict_keys(['cv', 'error_score', 'estimator__C', 'estimator__class_weight', 'estimator__dual', 'estimator__fit_intercept', 'estimator__intercept_scaling', 'estimator__l1_ratio', 'estimator__max_iter', 'estimator__multi_class', 'estimator__n_jobs', 'estimator__penalty', 'estimator__random_state', 'estimator__solver', 'estimator__tol', 'estimator__verbose', 'estimator__warm_start', 'estimator', 'n_jobs', 'param_grid', 'pre_dispatch', 'refit', 'return_train_score', 'scoring', 'verbose'])

In [119]:
print(f'Cross-validation score: {cv_score}')
print(f'Test score: {test_score}')
grid_search.best_params_    
grid_search.best_estimator_

Cross-validation score: 0.882580752249323
Test score: 0.8803448226416891


SVC(C=100, random_state=42)

In [40]:
X_train , X_test , y_train , y_test = train_test_split(X , y, stratify=y,
                                                       test_size =0.2, 
                                                       random_state=42)

In [41]:
Vectorize = TfidfVectorizer(lowercase = True, ngram_range=(1, 2), analyzer='word')
X_train = Vectorize.fit_transform(X_train)
X_test = Vectorize.transform(X_test)

In [42]:
log_sgd1 = LogisticRegression(C=5)
log_sgd2 = LogisticRegression(C=5)

In [43]:
estimators = [('log_sgd1', log_sgd1), ('log_sgd2', log_sgd2)]
final_est = RidgeClassifier(alpha=0.2125)

In [44]:
from sklearn.ensemble import StackingClassifier

In [45]:
stacking_log2 = StackingClassifier(estimators = estimators,
                           final_estimator = final_est,
                           passthrough = True)

In [46]:
stacking_log2.fit(X_train , y_train)

StackingClassifier(estimators=[('log_sgd1', LogisticRegression(C=5)),
                               ('log_sgd2', LogisticRegression(C=5))],
                   final_estimator=RidgeClassifier(alpha=0.2125),
                   passthrough=True)

In [47]:
pred = stacking_log2.predict(X_test)

In [48]:
model_stats = {}
model_stats[stacking_log2.__class__.__name__] = {
        'F1-Macro':metrics.f1_score(y_test, pred, average='macro'),
        'F1-Accuracy':metrics.f1_score(y_test, pred, average='micro'),
        'F1-Weighted':metrics.f1_score(y_test, pred, average='weighted')}
pd.DataFrame.from_dict(model_stats, orient='index')

Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted
StackingClassifier,0.846806,0.849941,0.846806


In [31]:
X = test['message'].apply(lambda x: _analyzer(x))
Vector = Vectorize.transform(X)

In [32]:
model = SVC(C=100, random_state=42).fit(X_train , y_train)

In [33]:
pred = model.predict(Vector)

In [34]:
daf = pd.DataFrame(pred, columns=['sentiment'])
daf.head(20)

Unnamed: 0,sentiment
0,1
1,1
2,1
3,1
4,0
5,2
6,1
7,1
8,1
9,2


In [35]:
output = pd.DataFrame({"tweetid":test['tweetid']})
submission = output.join(daf)        
submission.to_csv("submission3.csv", index=False)
submission

Unnamed: 0,tweetid,sentiment
0,169760,1
1,35326,1
2,224985,1
3,476263,1
4,872928,0
...,...,...
10541,895714,0
10542,875167,1
10543,78329,0
10544,867455,0
