<a href="https://colab.research.google.com/github/IshaShah27/comey-nlp/blob/master/NLP_Test_2_Comey_hearing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Summary statistics

##### Word occurrence (word cloud)
##### Word occurrence by Comey vs Republican vs Democrat
##### Average length of statement Comey v Republican v Democrat

In [1]:
# Import packages
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell

# Make sure notebook prints all output a cell
InteractiveShell.ast_node_interactivity = "all"

# Make sure notebook prints plots
%matplotlib inline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Import testimony csv
url = "https://raw.githubusercontent.com/IshaShah27/comey-nlp/master/comeytestimony/qa.csv"

test = pd.read_csv(url)
test.head()

Unnamed: 0,Senator,Party Affiliation,Full Question,Comey,Comey Response
0,BURR,Republican,Do you have any doubt that Russia attempted to...,COMEY,None.
1,BURR,Republican,Do you have any doubt that the Russian governm...,COMEY,"No, no doubt."
2,BURR,Republican,Do you have any doubt the Russian government w...,COMEY,No.
3,BURR,Republican,Are you confident that no votes cast in the 20...,COMEY,I'm confident. When I left as director I had s...
4,BURR,Republican,"Director Comey, did the president at any time ...",COMEY,"Not to my understanding, no."


In [18]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
import re

In [0]:
# Reshape dataset - here, we're not interested in the question-answer relationship
comey = test[['Comey', 'Comey Response']].rename(columns = {'Comey':'name',
                                                  'Comey Response':'text'})
comey['group'] = "Comey"

# comey.head()
qalong = test[['Senator', 'Party Affiliation', 'Full Question']].rename(columns = {'Senator':'name',
                                                                       'Party Affiliation': 'group',
                                                                       'Full Question':'text'})
qalong = pd.concat([comey, qalong], sort=False).reset_index()

# Tokenize, remove stopwords, add bigrams
stop_rem = set(stopwords.words('english'))
qalong['word_token'] = qalong['text'].apply(lambda x: re.sub('[^A-z ]+', '', x)).apply(word_tokenize)
qalong['stop_rem'] = qalong['word_token'].apply(lambda x: [word for word in x if word not in stop_rem])

# Ignore bigrams for now
# qalong['bigrams'] = qalong['text'].apply(lambda x: re.sub('[^A-z ]+', '', x)).apply(lambda x: [x[i:i+2] for i in range(len(x)-1)])
qalong.head()

In [0]:
# Get word frequencies

# overall

# by groups

#### Sentiment analysis


##### Sentiment score overall (big 6 emotions + VAD)
##### Sentiment score by Comey, Repub, Dem compared to overall


In [47]:
# Import afinn
!pip install afinn
from afinn import Afinn
afinn = Afinn(language = 'en')

Collecting afinn
[?25l  Downloading https://files.pythonhosted.org/packages/86/e5/ffbb7ee3cca21ac6d310ac01944fb163c20030b45bda25421d725d8a859a/afinn-0.1.tar.gz (52kB)
[K     |██████▎                         | 10kB 14.6MB/s eta 0:00:01[K     |████████████▌                   | 20kB 3.2MB/s eta 0:00:01[K     |██████████████████▊             | 30kB 4.6MB/s eta 0:00:01[K     |█████████████████████████       | 40kB 3.0MB/s eta 0:00:01[K     |███████████████████████████████▏| 51kB 3.7MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 2.8MB/s 
[?25hBuilding wheels for collected packages: afinn
  Building wheel for afinn (setup.py) ... [?25l[?25hdone
  Created wheel for afinn: filename=afinn-0.1-cp36-none-any.whl size=53453 sha256=4f45a5200be8cf1745b5a38610df0cea932648bf94f0f8e7757608f259181454
  Stored in directory: /root/.cache/pip/wheels/b5/1c/de/428301f3333ca509dcf20ff358690eb23a1388fbcbbde008b2
Successfully built afinn
Installing collected packages: afinn
Succes

In [52]:
# Calculate afinn score and find average over text (to control for length)
df_sentiment = qalong
df_sentiment['afinn_score'] = qalong['text'].apply(afinn.score)
df_sentiment['num_words'] = df_sentiment['text'].apply(lambda x: len(x.split()))
df_sentiment['afinn_avg'] = df_sentiment['afinn_score']/df_sentiment['num_words']

df_sentiment.head()

Unnamed: 0,index,name,text,group,word_token,stop_rem,afinn_score,num_words,afinn_avg
0,0,COMEY,None.,Comey,[None],[None],0.0,1,0.0
1,1,COMEY,"No, no doubt.",Comey,"[No, no, doubt]","[No, doubt]",-3.0,3,-1.0
2,2,COMEY,No.,Comey,[No],[No],-1.0,1,-1.0
3,3,COMEY,I'm confident. When I left as director I had s...,Comey,"[Im, confident, When, I, left, as, director, I...","[Im, confident, When, I, left, director, I, se...",1.0,15,0.066667
4,4,COMEY,"Not to my understanding, no.",Comey,"[Not, to, my, understanding, no]","[Not, understanding]",-1.0,5,-0.2


In [0]:
# Import vader
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [0]:
# Run vader
def vaderize(df, textfield):
    '''Compute the Vader polarity scores for a textfield.
    Returns scores and original dataframe.'''

    analyzer = SentimentIntensityAnalyzer()

    # print('Estimating polarity scores for %d cases.' % len(df))
    sentiment = df[textfield].apply(analyzer.polarity_scores)

    # convert to dataframe
    sdf = pd.DataFrame(sentiment.tolist()).add_prefix('vader_')

    # merge dataframes
    df_combined = pd.concat([df, sdf], axis=1)
    return df_combined

df_sentiment = vaderize(df_sentiment, 'text')
# df_vaderized.head(10)
df_sentiment.tail(10)

In [0]:
# Import NRC


In [54]:
# check if differences by group
sent_vals = ["afinn_avg","vader_neg", "vader_neu", "vader_pos", "vader_compound"]

sent_nameavg = df_sentiment.groupby(['name', 'group'])[sent_vals].mean()
sent_groupavg = df_sentiment.groupby('group')[sent_vals].mean()

sent_nameavg.head(20)
sent_groupavg.head()

Index(['index', 'name', 'text', 'group', 'word_token', 'stop_rem',
       'afinn_score', 'num_words', 'afinn_avg', 'vader_neg', 'vader_neu',
       'vader_pos', 'vader_compound'],
      dtype='object')

Unnamed: 0_level_0,Unnamed: 1_level_0,afinn_avg,vader_neg,vader_neu,vader_pos,vader_compound
name,group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BLUNT,Republican,0.007636,0.046,0.888211,0.065842,0.076574
BURR,Republican,-0.025746,0.079429,0.884179,0.036357,-0.080821
COLLINS,Republican,0.009914,0.015462,0.943,0.041538,0.123085
COMEY,Comey,0.034179,0.116,0.708101,0.175903,0.127569
CORNYN,Republican,0.049508,0.0396,0.806933,0.1536,0.412653
COTTON,Republican,0.003337,0.025188,0.917625,0.057188,0.097394
FEINSTEIN,Democrat,0.123322,0.034,0.778636,0.187364,0.277973
HARRIS,Democrat,0.003522,0.017625,0.950708,0.031625,0.052987
HEINRICH,Democrat,-0.025601,0.063211,0.894421,0.042368,-0.017905
KING,Independent,0.037922,0.022,0.88285,0.09515,0.186295


Unnamed: 0_level_0,afinn_avg,vader_neg,vader_neu,vader_pos,vader_compound
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Comey,0.034179,0.116,0.708101,0.175903,0.127569
Democrat,0.014778,0.037545,0.893427,0.069027,0.137682
Independent,0.037922,0.022,0.88285,0.09515,0.186295
Republican,0.019847,0.038345,0.88403,0.077661,0.143945


### Classification (supervised)

##### Repub vs democrat - use different models
##### Verification stats (confusion matrix from best one) and interpretation 

#### Clustering / topic modeling (unsupervised)

##### Visualization
##### Interpretation

#### Findings / overall conclusions

In [0]:
#### Task 1 - Summary statistics
