## 2. Sentiment Analysis - Modeling
---

In [152]:
#importing libraries 

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier,RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier


## Modeling
---

In this segment, we will use the following models to classify the tweets. 

##### Models Used: 
1. Logistic Regression Model (??)
2. Multinomial Naive Bayes Model 
3. Decision Tree
3. Use of Grid Search to optimise the number of features in the count vectoriser in an attempt to improve model accuracy

##### Metric to Validate Model: 

Accuracy is likely the best metric to use here as improperly classifying a subreddit post is equally bad in this instance.

##### Outcome:

The Multinomial Naive Bayes Model was able to accurately classify 91.8% of the posts. 

In [153]:
filepath = '../datasets/tweets_clean_1.csv'

In [154]:
df_1 = pd.read_csv(filepath)

In [155]:
df_1.shape

(14601, 2)

In [156]:
df_1.head()

Unnamed: 0,text,airline_sentiment
0,said,1
1,plu ad commerci experi tacki,2
2,today must mean need take anoth trip,1
3,realli aggress blast obnoxi entertain guest fa...,0
4,realli big bad thing,0


In [157]:
df_1['text'].isnull()

0        False
1        False
2        False
3        False
4        False
         ...  
14596    False
14597    False
14598    False
14599    False
14600    False
Name: text, Length: 14601, dtype: bool

In [158]:
null = df_1['text'].isnull()

In [159]:
df_1[null]

Unnamed: 0,text,airline_sentiment


### Modeling 

- Split data into X and y 
- Train test split for model validation

##### Split data into `X` and `y`.

In [160]:
X = df_1['text']
y = df_1['airline_sentiment']

In [161]:
X.shape

(14601,)

In [162]:
X.head()

0                                                 said
1                         plu ad commerci experi tacki
2                 today must mean need take anoth trip
3    realli aggress blast obnoxi entertain guest fa...
4                                 realli big bad thing
Name: text, dtype: object

In [163]:
X.shape

(14601,)

##### Split data into test and train set

In [164]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=42,
                                                    stratify = y)

In [165]:
X_train.shape

(10950,)

In [166]:
X_train.isnull().sum()

0

In [167]:
y_train.shape

(10950,)

In [168]:
X_test.shape

(3651,)

In [109]:
# cv = CountVectorizer()

In [116]:
# cv_train = cv.fit_transform(X_train)

In [117]:
# tfidf = TfidfTransformer()

In [125]:
# tf_train = tfidf.transform(cv_train)

In [126]:
# lr = LogisticRegression()

In [127]:
# lr_train = lr.fit(tf_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Logistic Regression : Hyperparameter Optimisation

Approach:
- Created a pipeline to find optimal parameters for Count Vectorizer, TFIDF and model hyperparameters

In [145]:
pipeline = Pipeline([
        ('cvec', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('lr', LogisticRegression())
    ])

In [148]:
parameters = {
    'cvec__max_df': (.9, .95), # Ignore words more than x% 
    'cvec__max_features': (2500, 5000, 10000),
    'cvec__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'lr__penalty': ('none','l2'),
    'lr__max_iter':[1000]
}


In [149]:
gs = GridSearchCV(estimator = pipeline, 
                    param_grid = parameters,
                    verbose = 1,
                    n_jobs = -1, 
                    cv = 5)

In [150]:
gs.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   36.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  6.1min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [178]:
print(f"Best Score: {gs.best_score_}")
print(f"Best Parameters: ")
gs.best_params_

Best Score: 0.7764383561643836
Best Parameters: 


{'cvec__max_df': 0.9,
 'cvec__max_features': 5000,
 'cvec__ngram_range': (1, 2),
 'lr__max_iter': 1000,
 'lr__penalty': 'l2',
 'tfidf__use_idf': False}

In [180]:
# Retrain model using best parameters with entire train set
pipeline_lr = Pipeline([
    ('cvec', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('lr', LogisticRegression())
])
pipeline_lr.set_params(**gs.best_params_)
lr_model = pipeline_lr.fit(X_train,y_train)
print('Final model score')
print()
print(f'Train score: {lr_model.score(X_train,y_train)}') 
print(f'Test score: {lr_model.score(X_test,y_test)}')

Final model score

Train score: 0.848310502283105
Test score: 0.7825253355245139


### Naive Bayes : Hyperparameter Optimisation

Approach:
- Created a pipeline to find optimal parameters for Count Vectorizer, TFIDF and model hyperparameters

In [71]:

def classify_subreddits(X, y, model='lr'):
    
    # train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)
    models = {
        'lr': LogisticRegression(solver='lbfgs'),
        'knn': KNeighborsClassifier(),
        'nb': MultinomialNB(),
        'rf': RandomForestClassifier(),
        'et': ExtraTreesClassifier(),
        'ada': AdaBoostClassifier(),
        'gb': GradientBoostingClassifier()
    }
    
    # Building a pipeline
    pipeline = Pipeline([
        ('cvec', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        (model, models[model])
    ])
    
    # Setting base pipeline parameters
    parameters = {
#         'cvec__max_df': (0.5, 1.0), # Ignore words more than x% 
        'cvec__max_features': (2500, 5000, 10000),
        'cvec__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
        'tfidf__use_idf': (True, False)
    }
    
    # Additional parameters for different models
    if model == 'lr':
        parameters.update({'lr__penalty': ('none','l2')})
    elif model == 'knn':
        parameters.update({
            'knn__n_neighbors':[3,5,11],
            'knn__weights':['uniform','distance'],
#             'knn__metric':['euclidean','manhattan']
        })
    elif model == 'nb':
        parameters.update({'nb__alpha':(1,2)})
    
    elif model == 'rf':
        parameters.update({
            'rf__n_estimators': [100,150],
            'rf__max_depth': [1,2,None]
        })
    elif model == 'et':
        parameters.update({
            'et__n_estimators': [50,100,150]
        })
    elif model == 'ada':
        parameters.update({
            'ada__n_estimators': [50,75,100]
        })
        
    # Perfrom grid search
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=5)
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    print(parameters)
    print()
    grid_search.fit(X_train, y_train)
    
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    print()  
    
    # Retrain model using best parameters with entire train set
    pipeline_final = Pipeline([
        ('cvec', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        (model, models[model])
    ])
    pipeline_final.set_params(**grid_search.best_params_)
    final_model = pipeline.fit(X_train,y_train)
    print('Final model score')
    print()
    print(f'Train score: {final_model.score(X_train,y_train)}') 
    print(f'Test score: {final_model.score(X_test,y_test)}')
    
    predictions = final_model.predict(X_test)
    confusion_matrix(y_test, predictions)
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    print()
    print("True Negatives: %s" % tn)
    print("False Positives: %s" % fp)
    print("False Negatives: %s" % fn)
    print("True Positives: %s" % tp)
    return X_test,y_test, final_model.predict(X_test)

In [72]:
def run_all_models(X,y):
    models = {
    'lr': 'Logistic Regresson',
    'knn': 'K Nearest Neighbours',
    'nb': 'Naive Bayes',
    'rf': 'Random Forest',
    'et': 'Extra Trees',
    'ada': 'Ada Boost',
    }

    models_pred = {m:None for m in models}

    for model in models:
        print(f'----------------------- {models[model]} ------------------------')
        models_pred['text'],models_pred['label'],models_pred[model] = classify_subreddits(X, y, model=model)
        print()

    cols = ['text','lr','knn','nb','rf','et','ada','label']
    model_df = pd.DataFrame(models_pred)[cols]
    return model_df

In [73]:
ds_lp_df = run_all_models(X,y)

----------------------- Logistic Regresson ------------------------
Performing grid search...
pipeline: ['cvec', 'tfidf', 'lr']
parameters:
{'cvec__max_features': (2500, 5000, 10000), 'cvec__ngram_range': ((1, 1), (1, 2)), 'tfidf__use_idf': (True, False), 'lr__penalty': ('none', 'l2')}

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    0.8s finished


ValueError: np.nan is an invalid document, expected byte or unicode string.

In [26]:
#creating a pipeline
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('lr', LogisticRegression())
])

In [27]:
pipe_params = {
    'cvec__max_features':[2500,3500,4000,5000],
    'cvec__min_df':[2,3],
    'cvec__max_df':[.9,.95],
    'cvec__ngram_range':[(1,1),(1,2)],
    'tfidf__use_idf':[True,False],
    'lr__penalty':['none','l2']
    }

gs = GridSearchCV(estimator = pipe, 
                  param_grid = pipe_params,
                  verbose = 1,
                  n_jobs = -1,
                  cv = 5)

gs.fit(X_train,y_train)
print(gs.best_score_)
print(gs.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
ValueError: Found input variables with inconsistent numbers of samples: [1, 8760]

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    0.4s finished


ValueError: Found input variables with inconsistent numbers of samples: [1, 10950]

In [24]:
def lr_model(X_train, X_test, y_train, y_test):
    
    #creating a pipeline
    pipe = Pipeline([
        ('cvec', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('lr', LogisticRegression())
    ])
    
    pipe_params = {
    'cvec__max_features':[2500,3500,4000,5000],
#     'cvec__min_df':[2,3],
    'cvec__max_df':[.9,.95],
    'cvec__ngram_range':[(1,1),(1,2)],
    'tfidf__use_idf':[True,False],
    'lr__penalty':['l1','l2']
    
    }

    #grid search 
    gs = GridSearchCV(estimator = pipe, 
                        param_grid = pipe_params,
                        verbose = 1,
                        n_jobs = -1, 
                        cv = 5)
    
    print("Grid Search In Progress")
    print('===========================================================================')
    print(f"Parameters : {pipe_params}")
    
    gs.fit(X_train,y_train)

    print(f"Best Score: {gs.best_score_}")
    print(f"Best Parameters: {gs.best_params}")
    
    print("Final Model Score")
    final_model = gs.refit(X_train,y_train)
    print(f'Train Score:{final.model.score(X_train,y_train)}')
    print(f'Ttest Score:{final.model.score(X_test,y_test)}')

In [18]:
lr_model(X_train, X_test, y_train, y_test)

NameError: name 'lr_model' is not defined

In [16]:
# def run_model(model_type, X_train, X_test, y_train, y_test):
    
#     #specifying models
#     models = {'lr': LogisticRegression(), 
#               'nb': MultinomialNB(),
#               'dt': DecisionTreeClassifier(),
#               'rf': RandomForestClassifier()
#              }
    
#     #creating a pipeline
#     pipe = Pipeline([
#         ('cvec', CountVectorizer()),
#         ('tfidf', TfidfTransformer()),
#         (model_type, models[model_type])
#     ])
    
#     pipe_params = {
#     'cvec__max_features':[2500,3500,4000,5000],
#     'cvec__min_df':[2,3],
#     'cvec__max_df':[.9,.95],
#     'cvec__ngram_range':[(1,1),(1,2)],
#     'tfidf__use':[True,False]
    
#     }
        
#     #additional parameters for each model 
#     if model_type == 'nb':
#         pipe_params.update({'nb_alpha':np.logspace(-5,2,100)})
        
#     elif model_type == 'lr':
#         pipe_params.update({'penalty':['l1','l2']
# #                             'lr_alpha':np.logspace(-5,2,100)
#                            })
        
#     elif model_type == 'dt':
#         pipe_params.update({'max_depth':[3,5,7,10],
#                            'min_samples_split':[5,10,15,20],
#                            'min_samples_leaf':[2,3,4,5,6,7]})
    
#     elif model_type == 'rf':
#         pipe_params.update({'n_estimators': [100,150,200],
#                             'max_depth':[None,1,2,3,4,5]})

#     #grid search 
#     gs = GridSearchCV(estimator = pipe, 
#                         param_grid = pipe_params,
#                         verbose = 1,
#                         n_jobs = -1, 
#                         cv = 5)
    
#     print("Grid Search for " + model_type)
#     print('===========================================================================')
# #     print(f"Parameters : {pipe_params}")
    
#     gs.fit(X_train,y_train)

#     print(f"Best Score: {gs.best_score_}")
#     print(f"Best Parameters: {gs.best_params}")
    
#     print("Final Model Score")
#     final_model = gs.refit(X_train,y_train)
#     print(f'Train Score:{final.model.score(X_train,y_train)}')
#     print(f'Ttest Score:{final.model.score(X_test,y_test)}')

In [17]:
# run_model('lr',X_train, X_test, y_train, y_test)

Grid Search for lr
Fitting 5 folds for each of 128 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


ValueError: Invalid parameter penalty for estimator Pipeline(memory=None,
         steps=[('cvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('lr',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.

In [19]:
# def model(model_type, X_train, X_test, y_train, y_test):
#     if model_type == 'Multinomial NB':
#         nb = MultinomialNB()
#         model = nb.fit(X_train,y_train)
        
#     if model_type == 'Decision Tree':
#         dt = DecisionTreeClassifier()
#         model = dt.fit(X_train,y_train)
        
#     if model_type == 'Random Forest':
#         rf = RandomForestClassifier()
#         model = rf.fit(X_train,y_train) 
    
#     print(model_type)
#     print('===========================================')
#     print('Train Score:'+ model.score(X_train, y_train))
#     print('Test Score:' + model.score(X_train, y_train))
    

In [8]:
models = {'lr': 1}

In [9]:
models['lr']

1

In [None]:
# def run_model(model_type, X_train, X_test, y_train, y_test):
    
#     #specifying models
#     models = {'lr': LogisticRegression(), 
#               'nb': MultinomialNB(),
#               'dt': DecisionTreeClassifier(),
#               'rf': RandomForestClassifier()
#              }
    
#     #creating a pipeline
#     pipe = Pipeline([
#         ('cvec', CountVectorizer()),
#         ('tfidf', TfidfTransformer())
#         (model_type, models[model_type])
#     ])
    
#     pipe_params = {
#     'cvec__max_features':[2500,3500,4000,5000]
#     'cvec__min_df':[2,3],
#     'cvec__max_df':[.9,.95],
#     'cvec__ngram_range':[(1,1),(1,2)],
#     'tfidf__use':[True,False]
    
#     if model_type == 'nb':
#         pipe_params.update({'nb_alpha':np.logspace(-5,2,100)})
        
#     if model_type == 'lr':
#         pipe_params.update({'penalty':['l1','l2'],
#                             'C':np.logspace(-5,2,100)})
        
#     if model_type == 'dt':
#         pipe_params.update({'max_depth':[3,5,7,10],
#                            'min_samples_split':[5,10,15,20],
#                            'min_samples_leaf':[2,3,4,5,6,7]})
    
#     if model_type == 'rf':
#         pipe_params.update({'n_estimators': [100,150,200],
#                             'max_depth':[None,1,2,3,4,5]})
# }