# Part 3: KNOWLEDGE EXTRACTION

In [1]:
import pandas as pd
data_path = 'data_reddit.csv'
df_reddit = pd.read_csv(data_path)  

### Questions

**I. Import the .csv data into a dataframe `covid_subreddits` with the following columns: `author`, `posted_at`, `num_comments`, `score`, `selftext`, `subbredit`, `title`. Remove any data that is not part of the following subreddits: Coronavirus, CoronavirusUK, CoronavirusUS, COVID, COVID19.** 

In [2]:
# Cleaning data
df_tmp=df_reddit[['user_registered_at','user_upvote_ratio']].dropna()   # get the last two colums
df_reddit.drop(labels=['user_registered_at','user_upvote_ratio'],axis=1,inplace=True)   # remove last two colums
df_reddit.dropna(thresh=5,inplace=True)    # delete row if less than 5 non-empty value in this row.
df_reddit.reset_index(drop=True, inplace=True)  # reset the index after droping many rows
df_reddit=pd.concat([df_reddit,df_tmp],axis=1)  # splice two dataframe 
df_reddit['posted_at']=pd.to_datetime(df_reddit['posted_at'])  # convert post at to datetime 
# df_reddit['selftext'].fillna('',inplace=True) # fill NaN with blank string
# df_reddit.fillna(0.0,inplace=True) # fill na with 0.0 such as time and ratio
covid_subreddits=df_reddit[['author','posted_at','num_comments','score','selftext','subreddit','title']]
covid_subreddits=covid_subreddits[covid_subreddits['subreddit'].isin(['Coronavirus','CoronavirusUK','CoronavirusUS','COVID','COVID19'])]

covid_subreddits

Unnamed: 0,author,posted_at,num_comments,score,selftext,subreddit,title
9,61539,2020-03-11 07:36:00,8.0,1.0,,Coronavirus,Coronavirus updates: CDC says people who test ...
10,61539,2020-07-26 14:42:00,116.0,1.0,,Coronavirus,You're going to need more than one coronavirus...
11,61539,2020-12-11 17:55:00,1.0,1.0,,Coronavirus,"Congressional COVID-19 imp[***]e continues, Pe..."
12,61539,2020-07-27 14:13:00,13.0,1.0,,Coronavirus,First Phase 3 test of coronavirus vaccine cand...
13,61539,2020-07-29 08:55:00,6.0,1.0,,Coronavirus,'The hotspot of a hotspot of a hotspot': coron...
...,...,...,...,...,...,...,...
18340,Zuom,2020-03-15 21:05:00,5.0,1.0,,Coronavirus,Fresno County declared a state of emergency Su...
18341,Zuom,2020-03-25 15:55:00,20.0,1.0,,Coronavirus,Dad in China designs ‘safety pod’ to protect b...
18342,Zuom,2020-04-04 03:12:00,5.0,1.0,,Coronavirus,Rick Perry asks for elastic donations for his ...
18343,Zuom,2020-02-04 07:34:00,4.0,1.0,,Coronavirus,NYC homeless services worker[***] with coronav...


**II.Create a new dataframe: [5 marks]**

a. Make a new dataframe `subreddit_overview` with columns `subreddit` and `nbr_of_posts`, that counts how many posts have been made in each subreddit. 

In [3]:
subreddit_overview = covid_subreddits[['subreddit','title']]
subreddit_overview = subreddit_overview.groupby(['subreddit']).size().reset_index() 
subreddit_overview.rename(columns={0:'nbr_of_posts'},inplace=True)

subreddit_overview

Unnamed: 0,subreddit,nbr_of_posts
0,COVID,50
1,COVID19,200
2,Coronavirus,3078
3,CoronavirusUK,215
4,CoronavirusUS,162


b. Add a column `avg_title_length` to the `subreddit_overview` dataframe averaging the post `title` length for each subreddit.

In [4]:
b_tmp=covid_subreddits.loc[:,['title','subreddit']]  # create a temp dataframe for caculating
b_tmp['title_length']=b_tmp['title'].str.len()

# aggregation
b_tmp=b_tmp.groupby(['subreddit'])
b_tmp=b_tmp['title_length'].mean().reset_index()
subreddit_overview['avg_title_length'] = b_tmp['title_length']  # add avg_title_length column to final dataframe

c. Add a column `comment_text_ratio` to the `subreddit_overview` dataframe calculating the ratio between the length of the `selftext` and the number of comments.

In [5]:
c_tmp=covid_subreddits.loc[:,['subreddit','selftext','num_comments']]

c_tmp['comment_text_ratio']=c_tmp['selftext'].str.len()/c_tmp['num_comments']
c_tmp.drop(columns='selftext',inplace=True)

d. Print the subreddit_overview dataframe.

In [6]:
subreddit_overview=pd.merge(c_tmp,subreddit_overview,how='inner',on='subreddit')
subreddit_overview

Unnamed: 0,subreddit,num_comments,comment_text_ratio,nbr_of_posts,avg_title_length
0,Coronavirus,8.0,,3078,84.067901
1,Coronavirus,116.0,,3078,84.067901
2,Coronavirus,1.0,,3078,84.067901
3,Coronavirus,13.0,,3078,84.067901
4,Coronavirus,6.0,,3078,84.067901
...,...,...,...,...,...
3700,COVID19,23.0,,200,91.500000
3701,COVID19,33.0,,200,91.500000
3702,COVID19,0.0,,200,91.500000
3703,COVID19,14.0,,200,91.500000


**III. Perform sentiment analysis on the `selftext` of the `covid_subreddits` dataframe. Calculate and print: the average sentiment per subreddit, the subreddits that are positive overall (if any), and the subreddits that are negative overall (if any). Print the resulting dataframes to .csv files named respectively `sa_results.csv`, `positive_subs.csv`, and `negative_subs.csv`.**

In [7]:
import eng_spacysentiment
import spacy
nlp_s = eng_spacysentiment.load() # load in the sentiment model with default parameters
nlp_m = spacy.load("en_core_web_md")

In [8]:
sentiment=covid_subreddits.dropna(subset='selftext').loc[:,['subreddit','selftext']] # remove rows if selftext=NaN 
sentiment[['positive','negative']]=''   # insert two blank rows
for i in range(len(sentiment)):
    positive=0;negative=0
    doc=nlp_m(sentiment['selftext'].values[i])  # get cell value in selftext columns
    numOfSents = len(list(doc.sents))
    for sent in doc.sents:  # loop in one cell
        doc = nlp_s(str(sent))
        positive = positive + doc.cats["positive"]  # sum sentiment value of each sentence
        negative = negative + doc.cats["negative"]
    # cell sentiment equals sum sentiment divides number of sentence
    sentiment['positive'].values[i] = positive/numOfSents
    sentiment['negative'].values[i] = negative/numOfSents
# caculate overall sub sentiment
sa_results=sentiment.groupby(['subreddit'])
sa_results=sa_results[['positive','negative']].mean().reset_index()
postive_subs=sa_results.loc[sa_results['positive']>sa_results['negative'],'subreddit']
negative_subs=sa_results.loc[sa_results['positive']<sa_results['negative'],'subreddit']
# export csv
postive_subs.to_csv("postive_subs.csv")
negative_subs.to_csv("negative_subs.csv")
sa_results.to_csv("sa_results.csv")

**IV. Find the top 2 posters in the CoronavirusUK subreddit and calculate and print the similarity between the titles of all posts by each user. Return only one value.**

In [9]:
top_posters=covid_subreddits.loc[:,['author','subreddit']]
top_posters=top_posters[top_posters['subreddit'].isin(['CoronavirusUK'])]   # get column subreddit=CoronavirusUK
# count how many times the author posts in
top_posters=top_posters.groupby(['author']).count().reset_index().sort_values('subreddit').tail(2)
top_posters=pd.merge(covid_subreddits,top_posters,how='inner',on='author') # merge it to origin df to sum title
top_posters=top_posters.groupby(['author'])
top_posters=top_posters['title'].sum().reset_index()

poster1=nlp_m(top_posters['title'].values[0])
poster2=nlp_m(top_posters['title'].values[1])

poster1.similarity(poster2)

0.9462759212218078

**V. Extract the Named Entities/Keywords of the CoronavirusUK using an appropriate Python library for each of the following months: January 2020, May 2020, September 2020. Print which Named Entities/keywords appear in all 3 three months (if any) and which only appear in each month exclusively (if any).**

In [11]:
entities=covid_subreddits[covid_subreddits['subreddit'].isin(['CoronavirusUK'])]
entities["tokens"]=entities["title"].apply(nlp_m)
jan_entities=entities.loc[entities['posted_at'].between('2020-01-01','2020-01-31')]
may_entities=entities.loc[entities['posted_at'].between('2020-05-01','2020-05-31')]
sep_entities=entities.loc[entities['posted_at'].between('2020-09-01','2020-09-30')]
jan_list=[];may_list=[];sep_list=[]
for tokens in jan_entities['tokens'].values:
    for ent in tokens.ents:
        jan_list.append((ent.text,ent.label_))
for tokens in may_entities['tokens'].values:
    for ent in tokens.ents:
        may_list.append((ent.text,ent.label_))
for tokens in sep_entities['tokens'].values:
    for ent in tokens.ents:
        sep_list.append((ent.text,ent.label_))

all_three = set(jan_list).intersection(may_list,sep_list)
only_jan = set(jan_list).difference(may_list,sep_list)
only_may = set(may_list).difference(jan_list,sep_list)
only_sep = set(sep_list).difference(may_list,jan_list)
print(all_three,'\n\n',only_jan,'\n\n',only_may,'\n\n',only_sep)

{('England', 'GPE'), ('UK', 'GPE')} 

 {('early years', 'DATE'), ('first', 'ORDINAL'), ('ZOE Update', 'PERSON'), ('November 1st 2020', 'DATE'), ('three', 'CARDINAL'), ('Tuesday 01 December', 'DATE'), ('Tony Jones', 'PERSON'), ('Ronald McDonald', 'PERSON'), ('Week 43 2020', 'DATE')} 

 {('1st 2', 'DATE'), ('Govt', 'ORG'), ('Vent Megathread - May', 'PRODUCT'), ('Today', 'DATE'), ('June 15', 'DATE'), ('Peter Hitchens', 'PERSON'), ('29', 'CARDINAL'), ('Oxfordshire', 'GPE'), ('Europe', 'LOC'), ('Denmark', 'GPE'), ('U.K.', 'GPE'), ('May 26th 2020', 'DATE'), ('Cummings', 'GPE'), ('Greece', 'GPE'), ('China', 'GPE'), ('29,427', 'CARDINAL')} 

 {('10 days', 'DATE'), ('Vent Megathread - September', 'PRODUCT'), ('Norwegian', 'NORP'), ('Covid-19 Deaths', 'PERSON'), ('more than a year', 'DATE'), ('only 3%', 'PERCENT'), ('Brexit', 'PERSON'), ('Britons', 'PERSON'), ('an Office for National Statistics', 'ORG'), ('six', 'CARDINAL'), ('two weeks', 'DATE'), ('Friday 09 October', 'DATE'), ('July 9th 2020',

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entities["tokens"]=entities["title"].apply(nlp_m)
