# Analysis of New York Times Opinion and News Articles

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from misc import create_data

In [2]:
# Initialize count and tf-idf vectorizers
tf_idf_vectorizer = TfidfVectorizer(stop_words='english')
count_vectorizer = CountVectorizer(stop_words='english')

In [3]:
# Get the data from json file
data = create_data()
# Extract text from articles
articles_text = data['text']

In [4]:
# Get count and tf-idf matrices 
tf_idf_matrix = tf_idf_vectorizer.fit_transform(articles_text)
count_matrix = count_vectorizer.fit_transform(articles_text)

In [5]:
# Extract features
tf_idf_feature_names = tf_idf_vectorizer.get_feature_names()
count_feature_names = count_vectorizer.get_feature_names()

In [6]:
# Number of topics
n_topics = 3

# Number of top words per topic
n_top_words = 20

In [7]:
# Run LDA
lda = LatentDirichletAllocation(n_components=n_topics, learning_method='online').fit(count_matrix)

# Run NMF
nmf = NMF(n_components=n_topics, init='nndsvd').fit(tf_idf_matrix)

In [8]:
# Function to display the top words for lda and nmf
def show_topics(model, feature_names,n_top_words_per_topic):
    """
    Shows the number of of words per topic for each topic
    :param model Scikit learn model
    :param feature_names vector
    :param n_top_words_per_topic int
    """
    for topic_index, topic in enumerate(model.components_):
        print("Topic %d:" %(topic_index))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words_per_topic-1:-1]]))
        
# Run show_topics on lda and nmf
print("LDA results:")
show_topics(lda,count_feature_names,n_top_words)
print()
print("NMF results:")
show_topics(nmf,tf_idf_feature_names,n_top_words) 

LDA results:
Topic 0:
mr trump said clinton campaign republican mrs party president people obama new state election voters political like states presidential vote
Topic 1:
said mr police government united people state states officials country china year military american president war new officers years news
Topic 2:
said people like new years year world time city just children work percent ms school health women family life 000

NMF results:
Topic 0:
trump mr clinton mrs campaign republican said party voters republicans donald presidential president hillary election obama convention sanders nominee democratic
Topic 1:
police said people officers court black city law ms department justice mr year new federal officer like state women school
Topic 2:
mr said united russia turkey government china military syria european islamic states britain russian state war union american syrian erdogan


In [9]:
# Run k-means on both count_matrix and tf_idf_matrix
km_count = KMeans(n_clusters=n_topics, init='k-means++', max_iter=100, n_init=1).fit(count_matrix)
km_tf_idf = KMeans(n_clusters=n_topics, init='k-means++', max_iter=100, n_init=1).fit(tf_idf_matrix)

In [10]:
def show_clusters(model,feature_names, top_words, topics):
    """
    Shows top words per cluster
    :param model Scikit learn model
    :param feature_names vector
    :param top_words int
    :param topics int
    """
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = feature_names
    
    for i in range(topics):
        print("Cluster %d: " % i, end='')
        for ind in order_centroids[i, :top_words]:
            print(' %s' % terms[ind])
        print()

# Run show_clusters on K-means objects and feature_names vectors
print("K-means fit on a tf_idf matrix with tf_idf features")
show_clusters(km_tf_idf, tf_idf_feature_names,n_top_words,n_topics)
print("K-means fit on a tf_idf matrix with count features")
show_clusters(km_tf_idf, count_feature_names,n_top_words,n_topics)
print("K-means fit on a count matrix with tf_idf features")
show_clusters(km_count, tf_idf_feature_names, n_top_words,n_topics)
print("K-means fit on a count matrix with count features")
show_clusters(km_count, count_feature_names, n_top_words,n_topics)

K-means fit on a tf_idf matrix with tf_idf features
Cluster 0:  said
 mr
 police
 people
 ms
 new
 court
 state
 government
 year
 city
 like
 law
 years
 party
 officers
 time
 percent
 world
 page

Cluster 1:  trump
 mr
 clinton
 mrs
 campaign
 said
 republican
 party
 voters
 president
 republicans
 donald
 presidential
 sanders
 hillary
 obama
 election
 convention
 nominee
 democratic

Cluster 2:  said
 mr
 united
 military
 turkey
 syria
 russia
 islamic
 china
 government
 states
 state
 war
 american
 russian
 saudi
 iran
 nations
 syrian
 nuclear

K-means fit on a tf_idf matrix with count features
Cluster 0:  said
 mr
 police
 people
 ms
 new
 court
 state
 government
 year
 city
 like
 law
 years
 party
 officers
 time
 percent
 world
 page

Cluster 1:  trump
 mr
 clinton
 mrs
 campaign
 said
 republican
 party
 voters
 president
 republicans
 donald
 presidential
 sanders
 hillary
 obama
 election
 convention
 nominee
 democratic

Cluster 2:  said
 mr
 united
 military
 turk

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

In [None]:
# Pipelines for each classifier 
pipeline_multinomial_nb_count = Pipeline([('vectorizer', CountVectorizer()),('classifier', MultinomialNB())])
pipeline_multinomial_nb_tf_idf = Pipeline([('vectorizer', TfidfVectorizer()), ('classifier', MultinomialNB())])
pipeline_logistic_regression_count = Pipeline([('vectorizer', CountVectorizer()),('classifier', LogisticRegression())])
pipeline_logistic_regression_tf_idf = Pipeline([('vectorizer', TfidfVectorizer()),('classifier', LogisticRegression())])
pipeline_random_forest_count = Pipeline([('vectorizer', CountVectorizer()),('classifier', RandomForestClassifier())])
pipeline_random_forest_tf_idf = Pipeline([('vectorizer', TfidfVectorizer()),('classifier', RandomForestClassifier())])
pipeline_decision_tree_count = Pipeline([('vectorizer', CountVectorizer()), ('classifier', DecisionTreeClassifier())])
pipeline_decision_tree_tf_idf = Pipeline([('vectorizer', CountVectorizer()), ('classifier', DecisionTreeClassifier())])

In [None]:
# Use 10-fold cross validation to determine accuracy of each method
k_fold = KFold(n_splits=10)
scores_count = {'Multinomial Naive Bayes':[], 'Logistic Regression':[], 'Random Forests': [], 'Decision Trees':[]}
scores_tf_idf = {'Multinomial Naive Bayes':[], 'Logistic Regression':[], 'Random Forests': [], 'Decision Trees':[]}

def compute_scores(pipeline,type_of_classifier,scores):
    """
    Compute the scores for each classifier given a corresponding pipeline and scores them in a dictionary
    :param pipeline sklean Pipeline
    :param type_of_classifier str
    :param scores dictionary
    """
    for train_indices, test_indices in k_fold.split(data['text']):
        train_text = data.iloc[train_indices]['text'].values
        train_y = data.iloc[train_indices]['type'].values

        test_text = data.iloc[test_indices]['text'].values
        test_y = data.iloc[test_indices]['type'].values

        pipeline.fit(train_text, train_y)
        predictions = pipeline.predict(test_text)

        score = f1_score(test_y, predictions,average='weighted')
        scores[type_of_classifier].append(score)
    scores[type_of_classifier] = sum(scores[type_of_classifier])/len(scores[type_of_classifier])
        
# Multinomial NB
compute_scores(pipeline_multinomial_nb_count, 'Multinomial Naive Bayes', scores_count)
compute_scores(pipeline_multinomial_nb_tf_idf, 'Multinomial Naive Bayes', scores_tf_idf)

# Logistic Regression
compute_scores(pipeline_logistic_regression_count, 'Logistic Regression', scores_count)
compute_scores(pipeline_logistic_regression_tf_idf, 'Logistic Regression', scores_tf_idf)

# Random Forests
compute_scores(pipeline_random_forest_count, 'Random Forests', scores_count)
compute_scores(pipeline_random_forest_tf_idf, 'Random Forests', scores_tf_idf)

# Decision Trees
compute_scores(pipeline_decision_tree_count, 'Decision Trees', scores_count)
compute_scores(pipeline_decision_tree_tf_idf, 'Decision Trees', scores_tf_idf)

# Display scores
print(scores_count)
print(scores_tf_idf)