In [1]:
# Import packages
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, classification_report, confusion_matrix

import matplotlib.pyplot as plt
%run C:/Users/siebe/Documents/JT_Charts.ipynb

ps = PorterStemmer()
stopwords_english = set(stopwords.words('english'))

# For reference: 
### a high cosine similarity means text is similar
### a low cosine similarity means text is different

In [2]:
# Cosine similarity
def cosine(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    cosine_similarity = dot_product / (norm_a * norm_b)
    return cosine_similarity

a = [1,1,1,1,1,1,0,0,0,0]
b = [0,0,0,0,0,0,1,1,1,1]
print("same", '{:1.1f}'.format(cosine(a, a)))
print("opposite", '{:1.1f}'.format(cosine(a, b)))

same 1.0
opposite 0.0


In [3]:
# Corpus
## Chinese newspapers
china_daily=pd.read_csv('China Daily.csv')
people_daily=pd.read_csv('People\'s Daily.csv')
xinhua_agent=pd.read_csv('Xinhua Agent.csv')

CH=pd.concat([china_daily, people_daily, xinhua_agent],axis=0,ignore_index=True)

## US newspapers
Wall_Street_Journal = pd.read_csv('Wall Street Journal.csv')
Washington_post = pd.read_csv('Washington Post.csv')
New_York_Times = pd.read_csv('New York Times.csv')

US=pd.concat([Wall_Street_Journal, New_York_Times, Washington_post],axis=0,ignore_index=True)

# Adjust columns of dataset and drop missings
def corpus_create(news, val):
    news = news.drop(['title'], axis=1)
    news = news.dropna(how='all')
    news['newspaper'] = val
    return news

CH = corpus_create(CH, 'Chinese newspapers')
US = corpus_create(US, 'US newspapers')

# Combine US and Chinese newspapers
corpus = pd.concat([US, CH],axis=0,ignore_index=True)
corpus.head()

Unnamed: 0,text,newspaper
0,[Financial Analysis and Commentary]\nPolitical...,US newspapers
1,China appears to be making its long-expected m...,US newspapers
2,[Financial Analysis and Commentary]\nIn his pu...,US newspapers
3,The possibility that Beijing would call in its...,US newspapers
4,Hong Kong -- President Xi Jinping wanted Tuesd...,US newspapers


In [4]:
keywords = ["\'s", "china daily", "people daily", "xinhua", 
            "wall street journal", "new york times", "washington post",
            "nyt", "washington",
            "editorial", "commentary", "crdito", "keith"]

# Case removal, non-alpha removal, keyword removal, and stemming
def corpus_clean(corpus):
    corpus['text'] = corpus['text'].str.split()
    corpus['text'] = corpus['text'].apply(lambda x: [re.sub(r'[^a-zA-Z]', "",y.lower()) for y in x])
    corpus['text'] = corpus['text'].apply(lambda x: [y for y in x if y not in keywords])
    corpus['text'] = corpus['text'].apply(lambda x: [ps.stem(y) for y in x])
    corpus['text'] = corpus['text'].apply(lambda x: [' '.join(x)])
    corpus['text'] = pd.Series(corpus['text']).astype(str)
    return corpus['text']

corpus = corpus_clean(corpus)
CH = corpus_clean(CH)
US = corpus_clean(US)

In [5]:
# DTM
vectorizer = CountVectorizer(binary=True) 
vectorizer.fit(corpus)
CH_dtm = vectorizer.transform(CH)
US_dtm = vectorizer.transform(US)

print("Count of terms", len(vectorizer.get_feature_names()))

Count of terms 17054


In [6]:
# Collapse all terms
a = [max(x) for x in zip(*CH_dtm.toarray())]
b = [max(x) for x in zip(*US_dtm.toarray())]

cosine_similarity = cosine(a, b)

print("US and Chinese newspaper cosine similarity")
print('{:1.4f}'.format(cosine_similarity))

US and Chinese newspaper cosine similarity
0.5508


# US and Chinese newspaper cosine similarity (0.55)
### Cosine similarity is mid-range

In [7]:
# Chinese newspapers
china_daily = corpus_create(china_daily, 'China Daily')
people_daily = corpus_create(people_daily, 'People\'s Daily')
xinhua_agent = corpus_create(xinhua_agent, 'Xinhua Agent')

corpus=pd.concat([china_daily, people_daily, xinhua_agent],axis=0,ignore_index=True)

corpus = corpus_clean(corpus)
china_daily = corpus_clean(china_daily)
people_daily = corpus_clean(people_daily)
xinhua_agent = corpus_clean(xinhua_agent)



In [8]:
# DTM
vectorizer = CountVectorizer(binary=True) 
vectorizer.fit(corpus)
china_daily_dtm = vectorizer.transform(china_daily)
people_daily_dtm = vectorizer.transform(people_daily)
xinhua_agent_dtm = vectorizer.transform(xinhua_agent)

print("Count of terms", len(vectorizer.get_feature_names()))

Count of terms 8333


In [9]:
# Collapse all terms
a = [max(x) for x in zip(*china_daily_dtm.toarray())]
b = [max(x) for x in zip(*people_daily_dtm.toarray())]
c = [max(x) for x in zip(*xinhua_agent_dtm.toarray())]

# Cosine similarity
cosine_similarity = cosine(a, b)

print("China Daily and People Daily newspaper cosine similarity")
print('{:1.4f}'.format(cosine_similarity))
print("")

# Cosine similarity
cosine_similarity = cosine(b, c)

print("People Daily and Xinhua newspaper cosine similarity")
print('{:1.4f}'.format(cosine_similarity))
print("")

# Cosine similarity
cosine_similarity = cosine(a, c)

print("China Daily and Xinhua newspaper cosine similarity")
print('{:1.4f}'.format(cosine_similarity))
print("")

China Daily and People Daily newspaper cosine similarity
0.6789

People Daily and Xinhua newspaper cosine similarity
0.7153

China Daily and Xinhua newspaper cosine similarity
0.6364



# Chinese newspapers cosine similarity (0.64-0.72)
### Cosine similarity is higher meaning they are more similar

In [10]:
# US newspapers
Wall_Street_Journal = corpus_create(Wall_Street_Journal, 'Wall Street Journal')
Washington_post = corpus_create(Washington_post, 'Washington Post')
New_York_Times = corpus_create(New_York_Times, 'New York Times')

corpus=pd.concat([Wall_Street_Journal, Washington_post, New_York_Times],axis=0,ignore_index=True)

corpus = corpus_clean(corpus)
Wall_Street_Journal = corpus_clean(Wall_Street_Journal)
Washington_post = corpus_clean(Washington_post)
New_York_Times = corpus_clean(New_York_Times)

In [11]:
# DTM
vectorizer = CountVectorizer(binary=True) 
vectorizer.fit(corpus)
Wall_Street_Journal_dtm = vectorizer.transform(Wall_Street_Journal)
Washington_post_dtm = vectorizer.transform(Washington_post)
New_York_Times_dtm = vectorizer.transform(New_York_Times)

print("Count of terms", len(vectorizer.get_feature_names()))

Count of terms 14848


In [12]:
# Collapse all terms
a = [max(x) for x in zip(*Wall_Street_Journal_dtm.toarray())]
b = [max(x) for x in zip(*Washington_post_dtm.toarray())]
c = [max(x) for x in zip(*New_York_Times_dtm.toarray())]

# Cosine similarity
cosine_similarity = cosine(a, b)

print("Wall Street Journal and Washington Post newspaper cosine similarity")
print('{:1.4f}'.format(cosine_similarity))
print("")

# Cosine similarity
cosine_similarity = cosine(b, c)

print("Washington Post and New York Times newspaper cosine similarity")
print('{:1.4f}'.format(cosine_similarity))
print("")

# Cosine similarity
cosine_similarity = cosine(a, c)

print("Wall Street Journal and New York Times newspaper cosine similarity")
print('{:1.4f}'.format(cosine_similarity))
print("")

Wall Street Journal and Washington Post newspaper cosine similarity
0.6195

Washington Post and New York Times newspaper cosine similarity
0.6303

Wall Street Journal and New York Times newspaper cosine similarity
0.6220



# US newspapers cosine similarity (0.62-0.63)
### Cosine similarity is a little high