In [3]:
import time
from collections import Counter
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score, r2_score, classification_report, confusion_matrix
from sklearn.model_selection import (train_test_split, learning_curve, cross_val_score, cross_val_predict,
                                     ShuffleSplit, KFold, GridSearchCV)
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
import spacy
import re
import string
from sklearn.svm import SVC
from sklearn import svm
from nltk.stem import WordNetLemmatizer
import nltk

lemmatizer = WordNetLemmatizer()
nltk.download("wordnet")

nlp = spacy.load('en_core_web_sm')
STOPLIST = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS)) # 1: unreliable # 0: reliable
train = pd.read_csv(r'TrainDataset.csv', sep=",", engine='python')
print("Old shape of train:", train.shape)
train = train.dropna()

def clean_data(dataframe):
   # Remove punctation
    dataframe['text'] = dataframe['text'].str.replace('[^\w\s]',' ')
    # Remove numbers 
    dataframe['text'] = dataframe['text'].str.replace('[^A-Za-z]',' ')
    # Make sure any double-spaces are single 
    dataframe['text'] = dataframe['text'].str.replace('  ',' ')
    dataframe['text'] = dataframe['text'].str.replace('  ',' ')
    # Transform all text to lowercase
    dataframe['text'] = dataframe['text'].str.lower()   
    dataframe.drop_duplicates(subset=['text'], inplace=True)
    print("New shape:", dataframe.shape)
    return dataframe

traindata = clean_data(train)
dataframe = traindata
def word_extraction(sentence, vocab):
    global tokens
    ignore_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself',
                    'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
                    'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
                    'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
                    'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as',
                    'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
                    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off',
                    'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
                    'all', 'any', 'both', 'each', 'other', 'some', 'such', 'no', 'nor',
                    'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don',
                    'should', 'now', 'uses', 'use', 'using', 'used', 'one', 'also']
    # split into tokens by white space
    tokens = re.sub('[^\w]', " ", sentence).split()
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    tokens = [w.lower() for w in tokens if w not in STOPLIST]
    tokens = [w.lower() for w in tokens if w not in ignore_words]
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    tokens = [word for word in tokens if len(word) > 2]
    count = vocab.update(tokens)
    return tokens, count

def generate_bow(allsentences, vocab):
    bow = []
    for sentence in allsentences:
        words = word_extraction(sentence, vocab)
        # bag_vector = numpy.zeros(len(words))
        # for w in words:
        #     for i,word in enumerate(words):
        #         if word == w:
        #             bag_vector[i] += 1

        bow.append(words)
    return bow

vocab = Counter()
data_set = generate_bow(dataframe["text"], vocab)

[nltk_data] Downloading package wordnet to /Users/mikesam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Old shape of train: (10000, 5)
New shape: (8657, 5)


In [47]:
# data_set

In [40]:
print(len(vocab))

print(vocab.most_common(150))

89930
[('said', 37335), ('trump', 25083), ('state', 17726), ('people', 16685), ('year', 15057), ('time', 13799), ('new', 13438), ('like', 12305), ('clinton', 12086), ('president', 11777), ('american', 10183), ('government', 8344), ('day', 7979), ('country', 7895), ('say', 7777), ('world', 7161), ('united', 6997), ('right', 6858), ('make', 6829), ('news', 6828), ('obama', 6744), ('way', 6527), ('hillary', 6422), ('election', 6315), ('campaign', 6061), ('know', 5958), ('house', 5782), ('republican', 5780), ('woman', 5766), ('white', 5637), ('official', 5629), ('think', 5622), ('going', 5568), ('want', 5549), ('group', 5468), ('war', 5467), ('life', 5374), ('work', 5340), ('medium', 5215), ('law', 5177), ('city', 5063), ('political', 5046), ('thing', 5020), ('week', 5014), ('million', 4959), ('company', 4881), ('party', 4862), ('come', 4840), ('national', 4800), ('public', 4653), ('case', 4588), ('told', 4581), ('donald', 4483), ('email', 4431), ('need', 4408), ('percent', 4392), ('accord

In [41]:
df = pd.DataFrame(data_set)
df['News'] = df[df.columns[0:]].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)
df["Type"] = dataframe["label"]
# duplicate_word_list = [word for word, count in Counter(df["News"]).most_common() if count > 1]
dfModel = pd.DataFrame(df[["News", "Type"]])
dfModel = dfModel.dropna()

X = dfModel.News.values
y = dfModel.Type.values

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y, test_size=0.3)

In [42]:
start = time.time()
pipe = Pipeline([('cvec', CountVectorizer(stop_words=ENGLISH_STOP_WORDS)),
                 ('nb', MultinomialNB())])

# Tune GridSearchCV, So, the smaller the value of alpha(hyper parameter), the higher would be the magnitude of the coefficients.
pipe_params = {'cvec__ngram_range': [(1, 2)], 'nb__alpha': [20]}

# kf = KFold(n_splits=5, shuffle=True)
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=5)
gs.fit(X_train, y_train)

# best_score_ is the 'Mean cross-validated score of the best_estimator.
# best_score is a measure that incorporates how your model performs in models that it has not seen.
print("Train score", gs.score(X_train, y_train))
print("Validation Score", gs.best_score_)
print("Predicted Test score", gs.score(X_test, y_test))
# print(gs.best_params_)
print()
y_true, y_pred = y_test, gs.predict(X_test)
print(classification_report(y_true, y_pred))
print('Misclassified samples: %d' % (y_test != y_pred).sum())
end = time.time()
print("execution time is", end - start)

Train score 0.585096245473604
Validation Score 0.5780445969125214
Predicted Test score 0.5777777777777777
{'cvec__ngram_range': (1, 2), 'nb__alpha': 20}

              precision    recall  f1-score   support

         0.0       0.58      1.00      0.73      1301
         1.0       0.00      0.00      0.00       949

   micro avg       0.58      0.58      0.58      2250
   macro avg       0.29      0.50      0.37      2250
weighted avg       0.33      0.58      0.42      2250

Misclassified samples: 950


AttributeError: 'numpy.float64' object has no attribute 'lower'

In [43]:
start = time.time()

pipe = Pipeline([('cvec', TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)),
                 ('nb', MultinomialNB())])

# Tune GridSearchCV, So, the smaller the value of alpha(hyper parameter), the higher would be the magnitude of the 
# coefficients.
pipe_params = {'cvec__ngram_range': [(1, 2)], 'nb__alpha': [20]}

# kf = KFold(n_splits=5, shuffle=True)
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=5)
gs.fit(X_train, y_train)

# best_score_ is the 'Mean cross-validated score of the best_estimator.
# best_score is a measure that incorporates how your model performs in models that it has not seen.
print("Train score", gs.score(X_train, y_train))
print("Validation Score", gs.best_score_)
print("Predicted Test score", gs.score(X_test, y_test) * 100)
# print(gs.best_params_)
print()

y_true, y_pred = y_test, gs.predict(X_test)
print(classification_report(y_true, y_pred))
print('Misclassified samples: %d' % (y_test != y_pred).sum())
end = time.time()
print("execution time is", end - start)

Train score 0.5782351820087669
Validation Score 0.5782351820087669
Predicted Test score 57.82222222222222
{'cvec__ngram_range': (1, 2), 'nb__alpha': 20}

              precision    recall  f1-score   support

         0.0       0.58      1.00      0.73      1301
         1.0       0.00      0.00      0.00       949

   micro avg       0.58      0.58      0.58      2250
   macro avg       0.29      0.50      0.37      2250
weighted avg       0.33      0.58      0.42      2250

Misclassified samples: 949
execution time is 968.8343148231506


  'precision', 'predicted', average, warn_for)


In [None]:
start = time.time()

pipe = Pipeline([('cvec', TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)),
                 ('nb', svm.SVC(kernel='rbf'))])

# Tune GridSearchCV, So, the smaller the value of alpha(hyper parameter), the higher would be the magnitude of the coefficients.
pipe_params = {'cvec__ngram_range': [(1, 2)], 'nb__gamma':['scale']}

# kf = KFold(n_splits=5, shuffle=True)
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=5)
gs.fit(X_train, y_train)

# best_score_ is the 'Mean cross-validated score of the best_estimator.
# best_score is a measure that incorporates how your model performs in models that it has not seen.
print("Train score", gs.score(X_train, y_train))
print("Validation Score", gs.best_score_)
print("Predicted Test score", gs.score(X_test, y_test) * 100)
# print(gs.best_params_)
print()

y_true, y_pred = y_test, gs.predict(X_test)
print(classification_report(y_true, y_pred))
print('Misclassified samples: %d' % (y_test != y_pred).sum())
end = time.time()
print("execution time is", end - start)

In [31]:
start = time.time()
from sklearn.linear_model import LogisticRegression
pipe = Pipeline([('cvec', TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)),
                 ('logit', LogisticRegression(solver='liblinear'))])

# Tune GridSearchCV, So, the smaller the value of alpha(hyper parameter), the higher would be the magnitude of the coefficients.
pipe_params = {'cvec__ngram_range': [(1, 2)]}

kf = KFold(n_splits=5, shuffle=True)
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=kf)
gs.fit(X_train, y_train)

# best_score_ is the 'Mean cross-validated score of the best_estimator.
# best_score is a measure that incorporates how your model performs in models that it has not seen.
print("Train score", gs.score(X_train, y_train))
print("Validation Score", gs.best_score_)
print("Predicted Test score", gs.score(X_test, y_test) * 100)
print(gs.best_params_)
print()

y_true, y_pred = y_test, gs.predict(X_test)
print(classification_report(y_true, y_pred))
print('Misclassified samples: %d' % (y_test != y_pred).sum())
end = time.time()
print("execution time is", end - start)

AttributeError: lower not found

In [32]:
start = time.time()
from sklearn.ensemble import RandomForestClassifier
pipe = Pipeline([('cvec', TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)),
                 ('rf', RandomForestClassifier())])

# Tune GridSearchCV, So, the smaller the value of alpha(hyper parameter), the higher would be the magnitude of the coefficients.
pipe_params = {'cvec__ngram_range': [(1, 2)], 'rf__n_estimators':[20], 'rf__criterion':['gini']}

# kf = KFold(n_splits=3, shuffle=True)
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=5)
gs.fit(X_train, y_train)

# best_score_ is the 'Mean cross-validated score of the best_estimator.
# best_score is a measure that incorporates how your model performs in models that it has not seen.
print("Train score", gs.score(X_train, y_train))
print("Validation Score", gs.best_score_)
print("Predicted Test score", gs.score(X_test, y_test) * 100)
print(gs.best_params_)
print()

y_true, y_pred = y_test, gs.predict(X_test)
print(classification_report(y_true, y_pred))
print('Misclassified samples: %d' % (y_test != y_pred).sum())
end = time.time()
print("execution time is", end - start)

AttributeError: lower not found

In [33]:
# Random_forest_classifier
start = time.time()
from sklearn.ensemble import RandomForestClassifier
vectorizer = CountVectorizer(analyzer = 'word',stop_words=ENGLISH_STOP_WORDS)
x_vectors = vectorizer.fit_transform(dfModel.News.values)
y = dfModel.Type.values

X_train, X_test, y_train, y_test = train_test_split(x_vectors, y, test_size=0.3) # Get a training and test dataset

# n_estimators is the number of random forests to use
# n_jobs says to use all processors available
rf = RandomForestClassifier(n_estimators=20, criterion = 'gini')
rf.fit(X_train, y_train)

print("Random Forest Classfier")
print()
print("Training Accuracy ",rf.score(X_train, y_train))
print()
scores = cross_val_score(rf, X_train, y_train, cv=10)
print("Training Validated scores: Mean: %0.2f (+/- Std: %0.2f)" % (scores.mean(), scores.std() * 2))
print()

y_pred = rf.predict(X_test)
accuracy = r2_score(y_test, y_pred)
print ("R2 Score:", accuracy)
print()

print("Predicted Accuracy score {:.4}%".format(accuracy_score(y_test, y_pred) * 100))
print('Misclassified samples: %d' % (y_test != y_pred).sum())
print()    

end = time.time()
print("execution time is",end - start)

# title = "Learning Curves (Random Forest)"
# cv = ShuffleSplit(n_splits=10, test_size=0.2)
# # cv=KFold(n_splits=100, random_state=None, shuffle=True)
# plot_learning_curve(rf, title, x_vectors, y, ylim=(0.5, 1.01), cv=cv, n_jobs=4)

KeyboardInterrupt: 