In [1]:
import pandas as pd
#import random 
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Below is only for local testing purpuse, not included in submissison
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import numpy as np
import itertools
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix



# Below model if not choosen is not necessary 
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier

#from sklearn.model_selection import train_test_split
import nltk
## Download Resources
nltk.download('punkt')
nltk.download('words')
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')
nltk.download("stopwords")

from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer

from nltk.corpus import words, stopwords
#from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

############
! pip install syllables
import syllables
import string
from scipy.sparse import coo_matrix, hstack

import warnings
warnings.filterwarnings('ignore')


[nltk_data] Downloading package punkt to /home/claudezyx/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /home/claudezyx/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/claudezyx/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/claudezyx/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/claudezyx/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [2]:
def appendArticles(articleList, basePath):
    contents = ''
    for articleNumber in articleList:
        f = open(basePath+str(articleNumber)+".txt", "r")
        contents = f.read()+";"+contents
        f.close()
    return contents

def stemTweetToWordList(text, stopWords):
    ps = PorterStemmer()
    tweet =text.lower() # lower case
    tokenizer = RegexpTokenizer(r'\w+')  
    wordList = tokenizer.tokenize(tweet)
     # remove stop words and store the stem version. 
    return [ps.stem(word) for word in wordList if word not in stopWords]

def startNumber(text):
    if text[:1].isdigit():
        return 1
    else:
        return 0

def containQuestion(text):
    if '?' in text or '!' in text:
        return 1
    else:
        return 0
    
def numberSyllable(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    number_words = len(text.split())
    total = 0
    for word in text.split():
        total += syllables.estimate(word)
    return total/number_words

def assignLength(row, colName):
    return len(row[colName])

def POSTagging(text):
    tokens = nltk.word_tokenize(text)
    return nltk.pos_tag(tokens) 

def adjectives(lists):
    count = 0
    adjective_tag = ['JJ','JJR','JJS']
    for word in lists:
        if word[1] in adjective_tag:
            count +=1
    return count

def nouns(lists):
    count = 0
    noun_tag = ['NN','NNS','NNP']
    for word in lists:
        if word[1] in noun_tag:
            count +=1
    return count

def verbs(lists):
    count = 0
    verb_tag = ['VB','VBD','VBG','VBN','VBP','VBZ']
    for word in lists:
        if word[1] in verb_tag:
            count +=1
    return count

def firstWord(lists):
    tag = ['VB','VBD','VBG','VBN','VBP','VBZ','NN','NNS','NNP','JJ','JJR','JJS']
    word = lists[0]
    if word in tag:
        return 1
    else:
        return 0
    
def lastWord(lists):
    tag = ['VB','VBD','VBG','VBN','VBP','VBZ','NN','NNS','NNP','JJ','JJR','JJS']
    word = lists[-1]
    if word in tag:
        return 1
    else:
        return 0
    
def FPP(lists):
    count = 0
    FPP_tag = ['I','ME','WE','US','MY','MINE','OUR','OURS']
    for word in lists:
        if word[0].upper() in FPP_tag:
            count +=1
    return count


In [3]:
basePath = os.path.dirname(os.path.abspath("train.json"))
# 0:false, 1:partly true, 2:true
claim = pd.read_json(open(basePath + "/train/train.json", "r", encoding="utf8"))
txtPath = basePath+"/train/train_articles/"

In [4]:
claim['articleText'] = claim.apply(lambda row: appendArticles(row['related_articles'], txtPath) ,axis=1)

In [5]:
stopWords = stopwords.words('english')
claim['articleText'] = claim.apply(lambda row: stemTweetToWordList(row['articleText'], stopWords) ,axis=1)

In [7]:
sid = SentimentIntensityAnalyzer()
claim['claimSentiment'] = claim.apply(lambda row: sid.polarity_scores(row['claim'])['compound'] ,axis=1)
claim['SentimentAdjust'] = claim['claimSentiment']+1

In [8]:
#check if claim starts with a number
claim['start_number'] = claim.apply(lambda row: startNumber(row['claim']) ,axis=1)

In [9]:
#check if claim contain any ? or !
claim['containQX'] = claim.apply(lambda row: containQuestion(row['claim']) ,axis=1)
#number of words in the claim
claim['titleWords'] = claim.apply(lambda row: len(row['claim'].split()) ,axis=1)
#number of related articles to the claim
claim['numberArticle'] = claim.apply(lambda row: len(row['related_articles']) ,axis=1)
#number of average syllables in each word in the claim
claim['claimSyllable'] = claim.apply(lambda row: numberSyllable(row['claim']) ,axis=1)
claim['articleLength'] = claim.apply(lambda row: assignLength(row, 'articleText'), axis=1)
#average length of word in related articles to the claim
#claim['articleLength'] = claim.apply(lambda row: assignLength(row, 'articleText')/row['numberArticle'], axis=1)

count = lambda l1,l2: sum([1 for x in l1 if x in l2])
#get number of punctuation in the claim
claim['claimPunc'] = claim.apply(lambda row: count(row['claim'],set(string.punctuation)), axis=1)
#get number of punctuation in each related articles
claim['articlePunc'] = claim.apply(lambda row: count(row['articleText'],set(string.punctuation))/row['numberArticle'], axis=1)


modifications

In [10]:
#get a list of words and their POS for the claim
claim['claimPOS'] = claim.apply(lambda row: POSTagging(row['claim']), axis=1)
#check number of adjectives, nouns, and verbs in the claim
claim['claimAdj'] = claim.apply(lambda row: adjectives(row['claimPOS']), axis=1)
claim['claimNoun'] = claim.apply(lambda row: nouns(row['claimPOS']), axis=1)
claim['claimVerb'] = claim.apply(lambda row: verbs(row['claimPOS']), axis=1)
claim['claimPOSratio'] = claim.apply(lambda row: (row['claimAdj']+row['claimNoun']+row['claimVerb'])/len(row['claimPOS']), axis=1)
claim['claimFirst'] = claim.apply(lambda row: firstWord(row['claimPOS']), axis=1)
claim['claimLast'] = claim.apply(lambda row: lastWord(row['claimPOS']), axis=1)
claim['claimFPP'] = claim.apply(lambda row: FPP(row['claimPOS']), axis=1)

In [11]:
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    See full source and example: 
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [12]:
claim['articleText'] = claim.apply(lambda row: ' '.join(row['articleText']), axis=1)

In [14]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', \
                                   ngram_range=(1, 1), max_df=0.7, max_features=2500) 

In [15]:
tfidf_vectorizer.fit(claim['articleText']) 

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.7, max_features=2500,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [16]:
tfidf_train = tfidf_vectorizer.transform(claim['articleText'])

<15555x2500 sparse matrix of type '<class 'numpy.float64'>'
	with 9161304 stored elements in Compressed Sparse Row format>

In [None]:
max_df = 0.5
while max_df < 1:
#for numFeature in range(3000, 5000, 500):
    #print('number of feature is %f' %numFeature)
    print('max_df is %f' %max_df)
    n_splits = 2
    kfold = KFold(n_splits=n_splits)
    kfold.get_n_splits(claim)
    for train_idx, test_idx in kfold.split(claim):
        X_train, X_test = claim.iloc[train_idx], claim.iloc[test_idx]    
        y_train, y_test = X_train['label'], X_test['label']
        X_train_text, X_test_text= X_train['articleText'], X_test['articleText']

        ###
#         #preprocess data
#         normalizedList = ['numberArticle', 'articleLength', 'titleWords','claimPunc', 'articlePunc']
#         for column in normalizedList:
#             scaler = StandardScaler()
#             X_train[column] = scaler.fit_transform(pd.DataFrame(X_train[column])) - \
#                                               min(scaler.fit_transform(pd.DataFrame(X_train[column])))
#             X_test[column] = scaler.transform(pd.DataFrame(X_test[column])) - \
#                                               min(scaler.transform(pd.DataFrame(X_test[column])))
#         ###

        # Initialize the `tfidf_vectorizer` 
        tfidf_vectorizer = TfidfVectorizer(stop_words='english', \
                                           ngram_range=(1, 2), max_df=max_df, max_features=2500) 
        # Fit and transform the training data 
        tfidf_train = tfidf_vectorizer.fit_transform(X_train_text) 
        # Transform the test set 
        tfidf_test = tfidf_vectorizer.transform(X_test_text)

        featureToBeAdded = ['SentimentAdjust','numberArticle','articleLength','start_number',\
                            'containQX','titleWords','claimSyllable','claimPunc','articlePunc',\
                            'claimAdj', 'claimNoun', 'claimVerb', 'claimPOSratio', 'claimFirst',\
                            'claimLast', 'claimFPP',
                            #'claimSentiment', 'claimPOS'
                           ]

        combResults = tfidf_train
        for featureName in featureToBeAdded:
            colToBeAdded = coo_matrix(X_train[featureName]).transpose()
            combResults = hstack([combResults, colToBeAdded])

        combResultsTest = tfidf_test
        for featureName in featureToBeAdded:
            colToBeAdded = coo_matrix(X_test[featureName]).transpose()
            combResultsTest = hstack([combResultsTest, colToBeAdded])

        #multi_class : str, {‘ovr’, ‘multinomial’, ‘auto’}, optional (default=’ovr’)
        #solver : str, {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, optional (default=’liblinear’).
        clf = LogisticRegression(random_state=0, solver='newton-cg', multi_class='multinomial', max_iter=10000)
        #clf = MultinomialNB(alpha=alpha) 
        #clf = ComplementNB()
        #clf =  SVC()
        #clf = PassiveAggressiveClassifier(tol=50)
        clf.fit(combResults, y_train)
        pred = clf.predict(combResultsTest)
        score = accuracy_score(y_test, pred)
        print("accuracy:   %0.3f" % score)
        #cm = confusion_matrix(y_test, pred, labels=[0, 1, 2])
        #plot_confusion_matrix(cm, classes=[0, 1, 2])
        max_df += 0.1

max_df is 0.500000
accuracy:   0.613
accuracy:   0.612
max_df is 0.700000
accuracy:   0.612
accuracy:   0.615
max_df is 0.900000


In [21]:
# n_splits = 2
# kfold = KFold(n_splits=n_splits)
# kfold.get_n_splits(claim)
# for train_idx, test_idx in kfold.split(claim):
#     X_train, X_test = claim.iloc[train_idx], claim.iloc[test_idx]    
#     y_train, y_test = X_train['label'], X_test['label']
#     X_train_text, X_test_text= X_train['articleText'], X_test['articleText']
    
#     # Initialize the `tfidf_vectorizer` 
#     tfidf_vectorizer = TfidfVectorizer(stop_words='english', \
#                                        ngram_range=(2, 2), max_df=0.7, max_features=3000) 
#     # Fit and transform the training data 
#     tfidf_train = tfidf_vectorizer.fit_transform(X_train_text) 
#     # Transform the test set 
#     tfidf_test = tfidf_vectorizer.transform(X_test_text)

#     featureToBeAdded = ['SentimentAdjust',\
#                         #'claimSentiment',\
#                         'numberArticle','articleLength',\
#                         'start_number',\
#                         'containQX', \
#                         'titleWords',\
#                         'claimSyllable',\
#                         'claimPunc', 'articlePunc',\
#                        ]
#     combResults = tfidf_train
#     for featureName in featureToBeAdded:
#         colToBeAdded = coo_matrix(X_train[featureName]).transpose()
#         combResults = hstack([combResults, colToBeAdded])

#     combResultsTest = tfidf_test
#     for featureName in featureToBeAdded:
#         colToBeAdded = coo_matrix(X_test[featureName]).transpose()
#         combResultsTest = hstack([combResultsTest, colToBeAdded])

#     #multi_class : str, {‘ovr’, ‘multinomial’, ‘auto’}, optional (default=’ovr’)
#     #solver : str, {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, optional (default=’liblinear’).
#     clf = LogisticRegression(random_state=0, solver='newton-cg', multi_class='multinomial')
#     #clf = MultinomialNB() 
#     #clf = ComplementNB()
#     #clf =  SVC()
#     #clf = PassiveAggressiveClassifier(tol=50)
#     clf.fit(combResults, y_train)
#     pred = clf.predict(combResultsTest)
#     score = accuracy_score(y_test, pred)
#     print("accuracy:   %0.3f" % score)
#     cm = confusion_matrix(y_test, pred, labels=[0, 1, 2])
#     plot_confusion_matrix(cm, classes=[0, 1, 2])

In [14]:
#number of feature is 1500.000000
#accuracy:   0.612
#accuracy:   0.611
#number of feature is 2000.000000
#accuracy:   0.612
#accuracy:   0.617
#number of feature is 2500.000000
# number of feature is 3000.000000
# accuracy:   0.614
# accuracy:   0.620
# number of feature is 3500.000000
# accuracy:   0.617
# accuracy:   0.619
# number of feature is 4000.000000
# accuracy:   0.616
# accuracy:   0.619
# number of feature is 4500.000000
# accuracy:   0.614
# accuracy:   0.618