<h1>Homework 4 - Zachary Jernigan</h1>

<h3>Loading data and implementing packages</h3>

In [1]:
import pandas as pd
import json

In [2]:
with open('universities.json','r') as f:
    raw_data = json.load(f)

df_main = pd.DataFrame(raw_data)

df_main.head()

Unnamed: 0,college,content,nrefs,nlinks
0,Air Force Institute of Technology,The Air Force Institute of Technology (AFIT) i...,142,253
1,Albert Einstein College of Medicine,The Albert Einstein College of Medicine is a p...,88,234
2,American University,The American University (AU or American) is a ...,148,423
3,Arizona State University,Arizona State University (Arizona State or ASU...,515,880
4,Arkansas State University,Arkansas State University (A-State or ASU) is ...,109,281


<h3>Starting Analysis with Preprocessing</h3>

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk
import string

In [4]:
# these lines download the models and really only need to be run once ever
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
nltk.download('maxent_ne_chunker')
nltk.download('words') 
nltk.download('treebank')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/jovyan/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package treebank to /home/jovyan/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk

True

In [14]:
sws = set(stopwords.words('english'))
stemmer = PorterStemmer()
punc = set(string.punctuation)

#These words were added later on because they kept skewing the data away from topics and toward repeated words
custom_filter = set(['university', 'college', 'institute', 'state', 'school', 'penn','philadelphia','ucla','umass','pitt'])

#I borrowed this code from stack overflow instead of writing it myself
states = ["Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"]
lowercase_states = [state.lower() for state in states]

#want to remove repeated references to name of uni
uni_names = set()
for name in df_main['college']:
    split_name = name.lower().split()
    uni_names.update(split_name)


#Given a text, 
#1. tokenize the lowercase words
#2. for all the words that were tokenized look for only words in not stopwords or punctutation
# I later added removal of ==== and included len(word)>3 to remove abbreviations of unis like usc
#3. stem word then place into list. Return list of processed words
def clean_tokens(text):
    tokens = word_tokenize(text.lower())
    tokened_words = []

    for word in tokens:
        if word not in sws and word not in punc and word not in custom_filter and word not in uni_names and word != "====" and len(word)>3 and word not in lowercase_states:
            stemmed = stemmer.stem(word)
            tokened_words.append(stemmed)

    return tokened_words

#this only removes stop words
def clean_nostop(text):
    tokens = word_tokenize(text.lower())
    token_words =[]
    for word in tokens:
        if word not in sws:
            token_words.append(word)

    return token_words

#this only removes stop words
def clean_nopunc(text):
    tokens = word_tokenize(text.lower())
    token_words =[]
    for word in tokens:
        if word not in punc:
            token_words.append(word)

    return token_words

#this only removes stop words
def clean_juststem(text):
    tokens = word_tokenize(text.lower())
    token_words =[]
    for word in tokens:
        stemmed = stemmer.stem(word)
        token_words.append(stemmed)

    return token_words

df_main['stemmed'] = [clean_juststem(text) for text in df_main['content'].dropna()]
df_main['punc'] = [clean_nopunc(text) for text in df_main['content'].dropna()]
df_main['stop'] = [clean_nostop(text) for text in df_main['content'].dropna()]
df_main['cleaned'] = [clean_tokens(text) for text in df_main['content'].dropna()]


df_main.head()


Unnamed: 0,college,content,nrefs,nlinks,stemmed,no punc,no stop,cleaned,punc,stop
0,Air Force Institute of Technology,The Air Force Institute of Technology (AFIT) i...,142,253,"[the, air, forc, institut, of, technolog, (, a...","[the, air, force, institute, of, technology, a...","[air, force, institute, technology, (, afit, )...","[afit, postgradu, institut, provid, profession...","[the, air, force, institute, of, technology, a...","[air, force, institute, technology, (, afit, )..."
1,Albert Einstein College of Medicine,The Albert Einstein College of Medicine is a p...,88,234,"[the, albert, einstein, colleg, of, medicin, i...","[the, albert, einstein, college, of, medicine,...","[albert, einstein, college, medicine, private,...","[privat, found, 1953, oper, independ, degree-g...","[the, albert, einstein, college, of, medicine,...","[albert, einstein, college, medicine, private,..."
2,American University,The American University (AU or American) is a ...,148,423,"[the, american, univers, (, au, or, american, ...","[the, american, university, au, or, american, ...","[american, university, (, au, american, ), pri...","[privat, feder, charter, research, d.c., main,...","[the, american, university, au, or, american, ...","[american, university, (, au, american, ), pri..."
3,Arizona State University,Arizona State University (Arizona State or ASU...,515,880,"[arizona, state, univers, (, arizona, state, o...","[arizona, state, university, arizona, state, o...","[arizona, state, university, (, arizona, state...","[public, research, phoenix, metropolitan, area...","[arizona, state, university, arizona, state, o...","[arizona, state, university, (, arizona, state..."
4,Arkansas State University,Arkansas State University (A-State or ASU) is ...,109,281,"[arkansa, state, univers, (, a-stat, or, asu, ...","[arkansas, state, university, a-state, or, asu...","[arkansas, state, university, (, a-state, asu,...","[a-stat, public, research, jonesboro, flagship...","[arkansas, state, university, a-state, or, asu...","[arkansas, state, university, (, a-state, asu,..."


<h3>Beginning LDA</h3>

In [6]:
from gensim import corpora, models

In [12]:
#Starting cleaned text first this will be the baseline
dictionary = corpora.Dictionary(df_main['cleaned'])
dictionary.filter_extremes(no_below=5, no_above=0.6)
dictionary.compactify()

corpus = [dictionary.doc2bow(text) for text in df_main['cleaned']]
ldamodel = models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=20)
ldamodel.print_topics(num_topics=5, num_words=5)

[(0,
  '0.007*"baptist" + 0.004*"macon" + 0.004*"armi" + 0.004*"militari" + 0.003*"navi"'),
 (1,
  '0.004*"player" + 0.004*"alpha" + 0.003*"sigma" + 0.003*"band" + 0.003*"chancellor"'),
 (2,
  '0.003*"stadium" + 0.003*"museum" + 0.003*"season" + 0.002*"sustain" + 0.002*"percent"'),
 (3,
  '0.013*"hospit" + 0.006*"clinic" + 0.005*"cancer" + 0.005*"care" + 0.005*"biomed"'),
 (4,
  '0.003*"scholar" + 0.003*"prize" + 0.003*"truste" + 0.003*"societi" + 0.003*"nobel"')]

<h3>LDA without stopwords, punctuation or stemming</h3>

In [17]:
#Just stopwords
dictionary = corpora.Dictionary(df_main['stop'])
dictionary.filter_extremes(no_below=5, no_above=0.6)
dictionary.compactify()

corpus = [dictionary.doc2bow(text) for text in df_main['stop']]
ldamodel_stop = models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=20)
ldamodel_stop.print_topics(num_topics=5, num_words=5)

[(0,
  '0.012*"texas" + 0.008*"florida" + 0.003*"yale" + 0.003*"medicine" + 0.003*"colorado"'),
 (1,
  '0.010*"georgia" + 0.005*"miami" + 0.004*"ohio" + 0.004*"duke" + 0.004*"maryland"'),
 (2,
  '0.009*"tech" + 0.005*"penn" + 0.005*"medicine" + 0.005*"virginia" + 0.003*"cornell"'),
 (3,
  '0.005*"san" + 0.004*"boston" + 0.004*"york" + 0.003*"california" + 0.003*"columbia"'),
 (4,
  '0.004*"michigan" + 0.003*"illinois" + 0.003*"washington" + 0.003*"chicago" + 0.002*"player"')]

In [18]:
#Just punctuation
dictionary = corpora.Dictionary(df_main['punc'])
dictionary.filter_extremes(no_below=5, no_above=0.6)
dictionary.compactify()

corpus = [dictionary.doc2bow(text) for text in df_main['punc']]
ldamodel_punc = models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=20)
ldamodel_punc.print_topics(num_topics=5, num_words=5)

[(0,
  '0.014*"texas" + 0.005*"medicine" + 0.005*"michigan" + 0.004*"carolina" + 0.003*"hospital"'),
 (1,
  '0.009*"florida" + 0.005*"california" + 0.005*"uc" + 0.004*"miami" + 0.003*"maryland"'),
 (2,
  '0.009*"tech" + 0.008*"georgia" + 0.004*"illinois" + 0.004*"southern" + 0.003*"duke"'),
 (3,
  '0.004*"princeton" + 0.003*"usc" + 0.003*"ohio" + 0.003*"mit" + 0.003*"diego"'),
 (4,
  '0.004*"boston" + 0.004*"virginia" + 0.004*"penn" + 0.003*"yale" + 0.003*"washington"')]

In [19]:
#Just stemming
dictionary = corpora.Dictionary(df_main['stemmed'])
dictionary.filter_extremes(no_below=5, no_above=0.6)
dictionary.compactify()

corpus = [dictionary.doc2bow(text) for text in df_main['stemmed']]
ldamodel_stem = models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=20)
ldamodel_stem.print_topics(num_topics=5, num_words=5)

[(0,
  '0.010*"georgia" + 0.006*"uc" + 0.005*"york" + 0.003*"san" + 0.003*"cornel"'),
 (1,
  '0.011*"texa" + 0.004*"virginia" + 0.003*"m" + 0.003*"yale" + 0.003*"princeton"'),
 (2,
  '0.009*"penn" + 0.006*"carolina" + 0.006*"duke" + 0.006*"california" + 0.004*"purdu"'),
 (3,
  '0.007*"medicin" + 0.006*"michigan" + 0.006*"hospit" + 0.004*"miami" + 0.004*"clinic"'),
 (4,
  '0.009*"florida" + 0.008*"tech" + 0.004*"illinoi" + 0.003*"louisiana" + 0.003*"southern"')]

<h2>Sentiment Analysis</h2>

In [13]:
%matplotlib inline
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
nltk.download('maxent_ne_chunker')
nltk.download('words') 
nltk.download('treebank')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/jovyan/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package treebank to /home/jovyan/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True