## Find important words in research articles about school violence

In [2]:
import pandas as pd
import nltk
import numpy as np
%matplotlib inline  
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
from gensim import corpora
from operator import itemgetter

In [3]:
#Take one txt file as an example
example_file = open('data/d1.txt')

In [4]:
import sys  

reload(sys)  
sys.setdefaultencoding('utf8')

In [5]:
#Parse by sentences
import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
data = example_file.read()
docs = tokenizer.tokenize(data)

In [6]:
#Clean the input
stop = set(stopwords.words('english'))
stop.update(["this","these","many","may",'we','in','of','to','And','It',"It's","The",'the','abstract','background','them', '1','2','3','4','5','6','7','8','9'])
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

def clean(doc):
    stop_free = " ".join([i.lower() for i in str(doc).split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

In [7]:
#build dictionary and term matrix
doc_clean = [clean(doc).split() for doc in docs]
dictionary = corpora.Dictionary(doc_clean)
dictionary.filter_extremes(no_above=0.1)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

#Words in the dictionary sorted by the number of time it appeared in the doc
sorted_dict = sorted(dictionary.token2id.items(), key=lambda x: x[1], reverse = True)
sorted_dict

[(u'con', 141),
 (u'2008', 140),
 (u'time', 139),
 (u'u', 138),
 (u'without', 137),
 (u'analysis', 136),
 (u'alcohol', 135),
 (u'response', 134),
 (u'data', 133),
 (u'included', 132),
 (u'significant', 131),
 (u'assignment', 130),
 (u'higher', 129),
 (u'intention', 128),
 (u'finding', 127),
 (u'connectedness', 126),
 (u'5', 125),
 (u'high', 124),
 (u'1', 123),
 (u'nonparticipant', 122),
 (u'grant', 121),
 (u'baseline', 120),
 (u'participant', 119),
 (u'education', 118),
 (u'in', 117),
 (u'sample', 116),
 (u'evidence', 115),
 (u'al', 114),
 (u'measure', 113),
 (u'nearest', 112),
 (u'district\u2019s', 111),
 (u'similar', 110),
 (u'toward', 109),
 (u'following', 108),
 (u'9th', 107),
 (u'three', 106),
 (u'future', 105),
 (u'value', 104),
 (u'likely', 103),
 (u'measured', 102),
 (u'j', 101),
 (u'whether', 100),
 (u'16', 99),
 (u'14', 98),
 (u'12', 97),
 (u'account', 96),
 (u'10', 95),
 (u'spillover', 94),
 (u'extracurricular', 93),
 (u'tested', 92),
 (u'students\u2019', 91),
 (u'followup',

In [8]:
#tfidf using gensim package
tfidf = gensim.models.tfidfmodel.TfidfModel(doc_term_matrix, id2word = dictionary)

In [9]:
#Fit corpus to get scores for terms in dictionary
corpus_tfidf = tfidf[doc_term_matrix]
word_score = {dictionary.get(id): value for doc in corpus_tfidf for id, value in doc}
#print(word_score)

In [10]:
#Sort by term scores
import operator
sorted_word_score = sorted(word_score.items(), key=operator.itemgetter(1),reverse=True)
sorted_word_score

[(u'4', 1.0),
 (u'policy', 1.0),
 (u'p', 1.0),
 (u'research', 1.0),
 (u'12', 1.0),
 (u'j', 1.0),
 (u'measure', 1.0),
 (u'in', 1.0),
 (u'5', 1.0),
 (u'education', 0.9097298558352529),
 (u'2', 0.8397540032818169),
 (u'whether', 0.8239745925257059),
 (u'2008', 0.7753567334050402),
 (u'using', 0.7328280175142081),
 (u'10', 0.7328280175142081),
 (u'second', 0.7071067811865476),
 (u'likely', 0.6809199143253225),
 (u'random', 0.6804139157499614),
 (u'14', 0.6528847313856487),
 (u'period', 0.6521462389505934),
 (u'range', 0.6395055420741893),
 (u'health', 0.631523503888387),
 (u'3', 0.6299769056354008),
 (u'statistically', 0.6219058663282299),
 (u'6', 0.6140925832335032),
 (u'1', 0.6035420554991026),
 (u'significant', 0.6018729660222095),
 (u'survey', 0.5905258087194626),
 (u'possible', 0.5624385926132234),
 (u'measured', 0.5584477984793236),
 (u'grant', 0.5544855398289232),
 (u'greater', 0.5477436977084504),
 (u'spillover', 0.5455247563286337),
 (u'time', 0.5365154351260035),
 (u'et', 0.53560

In [11]:
#tfidf using sklearn package
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = 0, stop_words = stop)
tfidf_matrix =  tf.fit_transform(docs)
feature_names = tf.get_feature_names() 
len(feature_names)

6146

In [12]:
#print(feature_names)

In [13]:
#sort term scores
dense = tfidf_matrix.todense()
sentences = dense[0].tolist()[0]
phrase_scores = [pair for pair in zip(range(0, len(sentences)), sentences) if pair[1] > 0]
#sorted(phrase_scores, key=lambda t: t[1] * -1)

In [14]:
#Match term ID with term
sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1)
for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores][:20]:
    print('{0: <20} {1}'.format(phrase, score))

## Combine files for each type of violence

In [71]:
#Fight_assault_data
from io import open
import glob
path =r'/Users/aarya/Github/Lantern_v2/Data/SH_data'
allFiles = glob.glob(path + "/*.txt")
text = ''
for file in allFiles:
    with open(file, encoding='utf-8') as f:
        text = text + str(f.read())

In [72]:
texts = tokenizer.tokenize(text.decode('utf-8')) # add the decode function here to fix any unicodedecode errors
texts_clean = [clean(doc).split() for doc in texts]
dictionary = corpora.Dictionary(texts_clean)
dictionary.filter_extremes(no_above=0.5)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in texts_clean]

#Words in the dictionary sorted by the number of time it appeared in the doc
sorted_dict = sorted(dictionary.token2id.items(), key=lambda x: x[1], reverse = True)
sorted_dict

[(u'simpson', 1898),
 (u'rural', 1897),
 (u'471', 1896),
 (u'intentionally', 1895),
 (u'eom', 1894),
 (u'prevalent', 1893),
 (u'communitybased', 1892),
 (u'exposure', 1891),
 (u'gap', 1890),
 (u'chiodo', 1889),
 (u'chance', 1888),
 (u'variation', 1887),
 (u'conclusion', 1886),
 (u'concerning', 1885),
 (u'mass', 1884),
 (u'other', 1883),
 (u'41', 1882),
 (u'40', 1881),
 (u'detail', 1880),
 (u'42', 1879),
 (u'45', 1878),
 (u'44', 1877),
 (u'47', 1876),
 (u'46', 1875),
 (u'variety', 1874),
 (u'48', 1873),
 (u'compared', 1872),
 (u'inc', 1871),
 (u'girl', 1870),
 (u'education', 1869),
 (u'83', 1868),
 (u'sexually', 1867),
 (u'problem', 1866),
 (u'men\u2019s', 1865),
 (u'89', 1864),
 (u'88', 1863),
 (u'official', 1862),
 (u'whether', 1861),
 (u'jaffe', 1860),
 (u'made', 1859),
 (u'he', 1858),
 (u'highest', 1857),
 (u'hierarchical', 1856),
 (u'students\u2019', 1855),
 (u'but', 1854),
 (u'bus', 1853),
 (u'north', 1852),
 (u'function', 1851),
 (u'recruited', 1850),
 (u'spread', 1849),
 (u'1988

In [73]:
#tfidf using gensim package
tfidf = gensim.models.tfidfmodel.TfidfModel(doc_term_matrix, id2word = dictionary)
corpus_tfidf = tfidf[doc_term_matrix]
word_score = {dictionary.get(id): value for doc in corpus_tfidf for id, value in doc}
sorted_word_score = sorted(word_score.items(), key=operator.itemgetter(1),reverse=True)
sorted_word_score

[(u'1995', 1.0),
 (u'1997', 1.0),
 (u'1996', 1.0),
 (u'swearer', 1.0),
 (u'2000a', 1.0),
 (u'1979', 1.0),
 (u'221', 1.0),
 (u'ma', 1.0),
 (u'college', 1.0),
 (u'on', 1.0),
 (u'u', 1.0),
 (u'kt', 1.0),
 (u'theft', 1.0),
 (u'pp', 1.0),
 (u'9', 1.0),
 (u'2009', 1.0),
 (u'leo', 1.0),
 (u'2013', 1.0),
 (u'simpson', 1.0),
 (u'keywords', 0.9449657776175251),
 (u'111', 0.9103203810070124),
 (u'concept', 0.8966035023729834),
 (u'hand', 0.891468077040608),
 (u'51', 0.8712730356946504),
 (u'society', 0.8653772638810109),
 (u'pearson', 0.8616299429840025),
 (u'eom', 0.8453013098323015),
 (u'adolesc', 0.828276063954829),
 (u'control', 0.8240949728655488),
 (u'288', 0.8135212482458828),
 (u'ca', 0.8119551424863626),
 (u'290', 0.8060820939046603),
 (u'chicago', 0.8060820939046603),
 (u'witkowska', 0.805638299892356),
 (u'good', 0.8011206398531133),
 (u'success', 0.8011206398531133),
 (u'vulnerable', 0.7969116826338224),
 (u'2002', 0.795900701074588),
 (u'attarschwartz', 0.7931922752893724),
 (u'8', 0

In [74]:
#tfidf using sklearn package
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df = 7, stop_words = stop)
tfidf_matrix =  tf.fit_transform(texts)
feature_names = tf.get_feature_names() 
len(feature_names)

1419

In [75]:
#sort term scores
dense = tfidf_matrix.todense()
sentences = dense[0].tolist()[0]
phrase_scores = [pair for pair in zip(range(0, len(sentences)), sentences) if pair[1] > 0]
sorted(phrase_scores, key=lambda t: t[1] * -1)

[(964, 0.4021139627019474),
 (232, 0.309996392008725),
 (898, 0.28060740912220555),
 (1372, 0.24780947915884102),
 (545, 0.2415234569431377),
 (989, 0.22976042398078805),
 (1014, 0.22077647826585498),
 (859, 0.21350623305727703),
 (773, 0.21277753161132798),
 (657, 0.20080580954225757),
 (1185, 0.19064654900343345),
 (276, 0.1896550757950809),
 (940, 0.1888743207528982),
 (33, 0.18183513825274678),
 (1063, 0.17700913641018537),
 (906, 0.17507522514044835),
 (207, 0.17349200600051504),
 (668, 0.1576375644741868),
 (1253, 0.15106331789062177),
 (1250, 0.13233532291785108),
 (1152, 0.11265257143775669)]

In [76]:
#Match term ID with term
kwords_dict_3 = {}
sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1)
for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores]:
    kwords_dict_3[phrase] = score
kwords_dict_3

{u'12': 0.18183513825274678,
 u'adolescents': 0.17349200600051504,
 u'among': 0.309996392008725,
 u'based': 0.1896550757950809,
 u'examine': 0.2415234569431377,
 u'harassment': 0.20080580954225757,
 u'high': 0.1576375644741868,
 u'large': 0.21277753161132798,
 u'months': 0.21350623305727703,
 u'occurring': 0.28060740912220555,
 u'one': 0.17507522514044835,
 u'past': 0.1888743207528982,
 u'perpetration': 0.4021139627019474,
 u'population': 0.22976042398078805,
 u'prevalence': 0.22077647826585498,
 u'rates': 0.17700913641018537,
 u'school': 0.11265257143775669,
 u'sexual': 0.19064654900343345,
 u'students': 0.13233532291785108,
 u'study': 0.15106331789062177,
 u'victimization': 0.24780947915884102}

In [77]:
dict_list = [kwords_dict_1, kwords_dict_2, kwords_dict_3, kwords_dict_4, kwords_dict_5]

In [78]:
cat_list = ['Bully', 'Fight_Assault', 'Sexual Harassment', 'Drug', 'Vandalism']
main_keys = dict(zip(cat_list, dict_list))

In [79]:
main_keys['Sexual Harassment']

{u'12': 0.18183513825274678,
 u'adolescents': 0.17349200600051504,
 u'among': 0.309996392008725,
 u'based': 0.1896550757950809,
 u'examine': 0.2415234569431377,
 u'harassment': 0.20080580954225757,
 u'high': 0.1576375644741868,
 u'large': 0.21277753161132798,
 u'months': 0.21350623305727703,
 u'occurring': 0.28060740912220555,
 u'one': 0.17507522514044835,
 u'past': 0.1888743207528982,
 u'perpetration': 0.4021139627019474,
 u'population': 0.22976042398078805,
 u'prevalence': 0.22077647826585498,
 u'rates': 0.17700913641018537,
 u'school': 0.11265257143775669,
 u'sexual': 0.19064654900343345,
 u'students': 0.13233532291785108,
 u'study': 0.15106331789062177,
 u'victimization': 0.24780947915884102}

In [80]:
import json
json.dump(main_keys, open('main_keys', 'wb'))