<h1>Homework 4 - Zachary Jernigan</h1>

<h3>Loading data and implementing packages</h3>

In [1]:
import pandas as pd
import json

In [2]:
with open('universities.json','r') as f:
    raw_data = json.load(f)

df_main = pd.DataFrame(raw_data)

df_main.head()

Unnamed: 0,college,content,nrefs,nlinks
0,Air Force Institute of Technology,The Air Force Institute of Technology (AFIT) i...,142,253
1,Albert Einstein College of Medicine,The Albert Einstein College of Medicine is a p...,88,234
2,American University,The American University (AU or American) is a ...,148,423
3,Arizona State University,Arizona State University (Arizona State or ASU...,515,880
4,Arkansas State University,Arkansas State University (A-State or ASU) is ...,109,281


<h3>Starting Analysis with Preprocessing</h3>

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk
import string

In [4]:
# these lines download the models and really only need to be run once ever
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
nltk.download('maxent_ne_chunker')
nltk.download('words') 
nltk.download('treebank')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/jovyan/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package treebank to /home/jovyan/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk

True

In [5]:
sws = set(stopwords.words('english'))
stemmer = PorterStemmer()
punc = set(string.punctuation)

#These words were added later on because they kept skewing the data away from topics and toward repeated words
custom_filter = set(['university', 'college', 'institute', 'state', 'school', 'penn','philadelphia','ucla','umass','pitt'])

#I borrowed this code from stack overflow instead of writing it myself
states = ["Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"]
lowercase_states = [state.lower() for state in states]

#want to remove repeated references to name of uni
uni_names = set()
for name in df_main['college']:
    split_name = name.lower().split()
    uni_names.update(split_name)


#Given a text, 
#1. tokenize the lowercase words
#2. for all the words that were tokenized look for only words in not stopwords or punctutation
# I later added removal of ==== and included len(word)>3 to remove abbreviations of unis like usc
#3. stem word then place into list. Return list of processed words
def clean_tokens(text):
    tokens = word_tokenize(text.lower())
    tokened_words = []

    for word in tokens:
        if word not in sws and word not in punc and word not in custom_filter and word not in uni_names and word != "====" and len(word)>3 and word not in lowercase_states:
            stemmed = stemmer.stem(word)
            tokened_words.append(stemmed)

    return tokened_words

#this only removes stop words
def clean_nostop(text):
    tokens = word_tokenize(text.lower())
    token_words =[]
    for word in tokens:
        if word not in sws:
            token_words.append(word)

    return token_words

#this only removes stop words
def clean_nopunc(text):
    tokens = word_tokenize(text.lower())
    token_words =[]
    for word in tokens:
        if word not in punc:
            token_words.append(word)

    return token_words

#this only removes stop words
def clean_juststem(text):
    tokens = word_tokenize(text.lower())
    token_words =[]
    for word in tokens:
        stemmed = stemmer.stem(word)
        token_words.append(stemmed)

    return token_words

df_main['stemmed'] = [clean_juststem(text) for text in df_main['content'].dropna()]
df_main['punc'] = [clean_nopunc(text) for text in df_main['content'].dropna()]
df_main['stop'] = [clean_nostop(text) for text in df_main['content'].dropna()]
df_main['cleaned'] = [clean_tokens(text) for text in df_main['content'].dropna()]


df_main.head()


Unnamed: 0,college,content,nrefs,nlinks,stemmed,punc,stop,cleaned
0,Air Force Institute of Technology,The Air Force Institute of Technology (AFIT) i...,142,253,"[the, air, forc, institut, of, technolog, (, a...","[the, air, force, institute, of, technology, a...","[air, force, institute, technology, (, afit, )...","[afit, postgradu, institut, provid, profession..."
1,Albert Einstein College of Medicine,The Albert Einstein College of Medicine is a p...,88,234,"[the, albert, einstein, colleg, of, medicin, i...","[the, albert, einstein, college, of, medicine,...","[albert, einstein, college, medicine, private,...","[privat, found, 1953, oper, independ, degree-g..."
2,American University,The American University (AU or American) is a ...,148,423,"[the, american, univers, (, au, or, american, ...","[the, american, university, au, or, american, ...","[american, university, (, au, american, ), pri...","[privat, feder, charter, research, d.c., main,..."
3,Arizona State University,Arizona State University (Arizona State or ASU...,515,880,"[arizona, state, univers, (, arizona, state, o...","[arizona, state, university, arizona, state, o...","[arizona, state, university, (, arizona, state...","[public, research, phoenix, metropolitan, area..."
4,Arkansas State University,Arkansas State University (A-State or ASU) is ...,109,281,"[arkansa, state, univers, (, a-stat, or, asu, ...","[arkansas, state, university, a-state, or, asu...","[arkansas, state, university, (, a-state, asu,...","[a-stat, public, research, jonesboro, flagship..."


<h3>Beginning LDA</h3>

In [6]:
from gensim import corpora, models

<h3>LDA without stopwords, punctuation or stemming</h3>

In [7]:
#Just stopwords
dictionary = corpora.Dictionary(df_main['stop'])
dictionary.filter_extremes(no_below=5, no_above=0.6)
dictionary.compactify()

corpus = [dictionary.doc2bow(text) for text in df_main['stop']]
ldamodel_stop = models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=20)
ldamodel_stop.print_topics(num_topics=5, num_words=5)

[(0,
  '0.004*"michigan" + 0.003*"penn" + 0.002*"ohio" + 0.002*"miami" + 0.002*"illinois"'),
 (1,
  '0.009*"georgia" + 0.005*"york" + 0.004*"duke" + 0.003*"cornell" + 0.003*"carolina"'),
 (2,
  '0.005*"san" + 0.004*"california" + 0.003*"brown" + 0.003*"columbia" + 0.003*"mit"'),
 (3,
  '0.015*"texas" + 0.010*"florida" + 0.006*"medicine" + 0.004*"hospital" + 0.003*"yale"'),
 (4,
  '0.010*"tech" + 0.007*"virginia" + 0.006*"boston" + 0.004*"louisiana" + 0.003*"massachusetts"')]

In [8]:
#Just punctuation
dictionary = corpora.Dictionary(df_main['punc'])
dictionary.filter_extremes(no_below=5, no_above=0.6)
dictionary.compactify()

corpus = [dictionary.doc2bow(text) for text in df_main['punc']]
ldamodel_punc = models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=20)
ldamodel_punc.print_topics(num_topics=5, num_words=5)

[(0,
  '0.007*"georgia" + 0.005*"penn" + 0.004*"york" + 0.004*"medicine" + 0.003*"cornell"'),
 (1,
  '0.011*"texas" + 0.004*"california" + 0.004*"san" + 0.004*"boston" + 0.004*"uc"'),
 (2,
  '0.006*"michigan" + 0.004*"illinois" + 0.003*"washington" + 0.003*"chicago" + 0.003*"duke"'),
 (3,
  '0.009*"florida" + 0.005*"virginia" + 0.004*"miami" + 0.003*"ohio" + 0.003*"medicine"'),
 (4,
  '0.010*"tech" + 0.004*"yale" + 0.003*"arizona" + 0.003*"louisiana" + 0.003*"rice"')]

In [9]:
#Just stemming
dictionary = corpora.Dictionary(df_main['stemmed'])
dictionary.filter_extremes(no_below=5, no_above=0.6)
dictionary.compactify()

corpus = [dictionary.doc2bow(text) for text in df_main['stemmed']]
ldamodel_stem = models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=20)
ldamodel_stem.print_topics(num_topics=5, num_words=5)

[(0,
  '0.006*"boston" + 0.005*"carolina" + 0.005*"miami" + 0.004*"maryland" + 0.003*"massachusett"'),
 (1,
  '0.014*"florida" + 0.011*"georgia" + 0.008*"medicin" + 0.007*"california" + 0.007*"hospit"'),
 (2,
  '0.006*"penn" + 0.005*"washington" + 0.004*"virginia" + 0.004*"duke" + 0.004*"york"'),
 (3,
  '0.004*"columbia" + 0.004*"york" + 0.003*"mississippi" + 0.003*"southern" + 0.003*"arkansa"'),
 (4,
  '0.010*"texa" + 0.005*"tech" + 0.005*"michigan" + 0.004*"agricultur" + 0.003*"illinoi"')]

In [10]:
#Over the cleaned text last so the dictionary is correct for the sentiment analysis
dictionary = corpora.Dictionary(df_main['cleaned'])
dictionary.filter_extremes(no_below=5, no_above=0.6)
dictionary.compactify()

corpus = [dictionary.doc2bow(text) for text in df_main['cleaned']]
ldamodel = models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=20)
ldamodel.print_topics(num_topics=5, num_words=5)

[(0,
  '0.005*"player" + 0.004*"museum" + 0.003*"black" + 0.003*"percent" + 0.003*"band"'),
 (1,
  '0.005*"prize" + 0.004*"nobel" + 0.003*"truste" + 0.003*"architectur" + 0.003*"societi"'),
 (2,
  '0.003*"season" + 0.003*"stadium" + 0.003*"tournament" + 0.003*"architectur" + 0.002*"recreat"'),
 (3,
  '0.012*"hospit" + 0.006*"clinic" + 0.005*"care" + 0.005*"cancer" + 0.005*"chancellor"'),
 (4,
  '0.004*"caltech" + 0.003*"truste" + 0.003*"societi" + 0.003*"born" + 0.002*"jesuit"')]

<h2>Sentiment Analysis</h2>

In [11]:
%matplotlib inline
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk.corpus import stopwords

In [12]:
#Doing analysis on original text content
comp = []

for text in df_main['content']:
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(text)
    comp.append(ss['compound'])

df_main['compound'] = comp
df_main.head()

Unnamed: 0,college,content,nrefs,nlinks,stemmed,punc,stop,cleaned,compound
0,Air Force Institute of Technology,The Air Force Institute of Technology (AFIT) i...,142,253,"[the, air, forc, institut, of, technolog, (, a...","[the, air, force, institute, of, technology, a...","[air, force, institute, technology, (, afit, )...","[afit, postgradu, institut, provid, profession...",0.9971
1,Albert Einstein College of Medicine,The Albert Einstein College of Medicine is a p...,88,234,"[the, albert, einstein, colleg, of, medicin, i...","[the, albert, einstein, college, of, medicine,...","[albert, einstein, college, medicine, private,...","[privat, found, 1953, oper, independ, degree-g...",0.9973
2,American University,The American University (AU or American) is a ...,148,423,"[the, american, univers, (, au, or, american, ...","[the, american, university, au, or, american, ...","[american, university, (, au, american, ), pri...","[privat, feder, charter, research, d.c., main,...",0.9992
3,Arizona State University,Arizona State University (Arizona State or ASU...,515,880,"[arizona, state, univers, (, arizona, state, o...","[arizona, state, university, arizona, state, o...","[arizona, state, university, (, arizona, state...","[public, research, phoenix, metropolitan, area...",1.0
4,Arkansas State University,Arkansas State University (A-State or ASU) is ...,109,281,"[arkansa, state, univers, (, a-stat, or, asu, ...","[arkansas, state, university, a-state, or, asu...","[arkansas, state, university, (, a-state, asu,...","[a-stat, public, research, jonesboro, flagship...",0.9956


In [13]:
#need to add a string column not just a list of object
def string_cleaned(word_list):
    return ' '.join(word_list)

df_main['cleaned string'] = df_main['cleaned'].apply(string_cleaned)

In [14]:
#adding top topics to dataframe
def get_top_topic(c):
    c = dictionary.doc2bow(nltk.word_tokenize(c))
    topics = ldamodel.get_document_topics(c)
    top_topic = sorted(topics, key=lambda x: -x[1])[0][0]
    return top_topic
    
df_main['topic'] = df_main['cleaned string'].apply(get_top_topic)

df_main.head()

Unnamed: 0,college,content,nrefs,nlinks,stemmed,punc,stop,cleaned,compound,cleaned string,topic
0,Air Force Institute of Technology,The Air Force Institute of Technology (AFIT) i...,142,253,"[the, air, forc, institut, of, technolog, (, a...","[the, air, force, institute, of, technology, a...","[air, force, institute, technology, (, afit, )...","[afit, postgradu, institut, provid, profession...",0.9971,afit postgradu institut provid profession cont...,3
1,Albert Einstein College of Medicine,The Albert Einstein College of Medicine is a p...,88,234,"[the, albert, einstein, colleg, of, medicin, i...","[the, albert, einstein, college, of, medicine,...","[albert, einstein, college, medicine, private,...","[privat, found, 1953, oper, independ, degree-g...",0.9973,privat found 1953 oper independ degree-gr inst...,3
2,American University,The American University (AU or American) is a ...,148,423,"[the, american, univers, (, au, or, american, ...","[the, american, university, au, or, american, ...","[american, university, (, au, american, ), pri...","[privat, feder, charter, research, d.c., main,...",0.9992,privat feder charter research d.c. main campu ...,2
3,Arizona State University,Arizona State University (Arizona State or ASU...,515,880,"[arizona, state, univers, (, arizona, state, o...","[arizona, state, university, arizona, state, o...","[arizona, state, university, (, arizona, state...","[public, research, phoenix, metropolitan, area...",1.0,public research phoenix metropolitan area foun...,0
4,Arkansas State University,Arkansas State University (A-State or ASU) is ...,109,281,"[arkansa, state, univers, (, a-stat, or, asu, ...","[arkansas, state, university, a-state, or, asu...","[arkansas, state, university, (, a-state, asu,...","[a-stat, public, research, jonesboro, flagship...",0.9956,a-stat public research jonesboro flagship camp...,0


<h3>Comparing topics and Compound Scores</h3>

In [16]:
#looking to see if average sentiment analysis and topic have any correlation
df_main[["topic", "compound", "nrefs"]].groupby("topic").mean()

Unnamed: 0_level_0,compound,nrefs
topic,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.961946,202.037037
1,0.997595,364.410256
2,0.968114,174.65625
3,0.929492,108.444444
4,0.995328,175.206897


In [24]:
for num,topics in ldamodel.show_topics(num_topics=5, num_words=5, formatted=False):
    print("Topic", num)
    print("Words:")
    for keyword in topics:
        print(keyword[0])
    print()


Topic 0
Words:
player
museum
black
percent
band

Topic 1
Words:
prize
nobel
truste
architectur
societi

Topic 2
Words:
season
stadium
tournament
architectur
recreat

Topic 3
Words:
hospit
clinic
care
cancer
chancellor

Topic 4
Words:
caltech
truste
societi
born
jesuit



<h3>Sampling a Subset</h3>

In [27]:
df_subset = df_main.sample(50)

dictionary_ss = corpora.Dictionary(df_subset['cleaned'])
dictionary_ss.filter_extremes(no_below=5, no_above=0.6)
dictionary_ss.compactify()

corpus = [dictionary_ss.doc2bow(text) for text in df_subset['cleaned']]
ldamodel_ss = models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary_ss, passes=20)
ldamodel_ss.print_topics(num_topics=5, num_words=5)

[(0,
  '0.007*"stadium" + 0.004*"museum" + 0.004*"coach" + 0.004*"fratern" + 0.004*"tiger"'),
 (1,
  '0.006*"knight" + 0.005*"museum" + 0.005*"competit" + 0.004*"total" + 0.004*"memori"'),
 (2,
  '0.007*"rose" + 0.005*"lincoln" + 0.003*"write" + 0.003*"robert" + 0.003*"leagu"'),
 (3,
  '0.009*"hospit" + 0.009*"nurs" + 0.006*"clinic" + 0.005*"pharmaci" + 0.005*"biolog"'),
 (4,
  '0.009*"architectur" + 0.007*"band" + 0.005*"tiger" + 0.005*"hockey" + 0.004*"danc"')]

In [28]:
df_subset[["topic", "compound", "nrefs"]].groupby("topic").mean()

Unnamed: 0_level_0,compound,nrefs
topic,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.999487,178.875
1,1.0,429.0
2,0.923638,172.692308
3,0.998167,158.444444
4,0.99955,191.25
