## Find important words in research articles about school violence

In [2]:
import pandas as pd
import nltk
import numpy as np
%matplotlib inline  
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
from gensim import corpora
from operator import itemgetter

In [3]:
#Take one txt file as an example
example_file = open('data/d1.txt')

In [4]:
import sys  

reload(sys)  
sys.setdefaultencoding('utf8')

In [5]:
#Parse by sentences
import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
data = example_file.read()
docs = tokenizer.tokenize(data)

In [6]:
#Clean the input
stop = set(stopwords.words('english'))
stop.update(["this","these","many","may",'we','in','of','to','And','It',"It's","The",'the','abstract','background','them', '1','2','3','4','5','6','7','8','9'])
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

def clean(doc):
    stop_free = " ".join([i.lower() for i in str(doc).split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

In [7]:
#build dictionary and term matrix
doc_clean = [clean(doc).split() for doc in docs]
dictionary = corpora.Dictionary(doc_clean)
dictionary.filter_extremes(no_above=0.1)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

#Words in the dictionary sorted by the number of time it appeared in the doc
sorted_dict = sorted(dictionary.token2id.items(), key=lambda x: x[1], reverse = True)
sorted_dict

[(u'con', 141),
 (u'2008', 140),
 (u'time', 139),
 (u'u', 138),
 (u'without', 137),
 (u'analysis', 136),
 (u'alcohol', 135),
 (u'response', 134),
 (u'data', 133),
 (u'included', 132),
 (u'significant', 131),
 (u'assignment', 130),
 (u'higher', 129),
 (u'intention', 128),
 (u'finding', 127),
 (u'connectedness', 126),
 (u'5', 125),
 (u'high', 124),
 (u'1', 123),
 (u'nonparticipant', 122),
 (u'grant', 121),
 (u'baseline', 120),
 (u'participant', 119),
 (u'education', 118),
 (u'in', 117),
 (u'sample', 116),
 (u'evidence', 115),
 (u'al', 114),
 (u'measure', 113),
 (u'nearest', 112),
 (u'district\u2019s', 111),
 (u'similar', 110),
 (u'toward', 109),
 (u'following', 108),
 (u'9th', 107),
 (u'three', 106),
 (u'future', 105),
 (u'value', 104),
 (u'likely', 103),
 (u'measured', 102),
 (u'j', 101),
 (u'whether', 100),
 (u'16', 99),
 (u'14', 98),
 (u'12', 97),
 (u'account', 96),
 (u'10', 95),
 (u'spillover', 94),
 (u'extracurricular', 93),
 (u'tested', 92),
 (u'students\u2019', 91),
 (u'followup',

In [8]:
#tfidf using gensim package
tfidf = gensim.models.tfidfmodel.TfidfModel(doc_term_matrix, id2word = dictionary)

In [9]:
#Fit corpus to get scores for terms in dictionary
corpus_tfidf = tfidf[doc_term_matrix]
word_score = {dictionary.get(id): value for doc in corpus_tfidf for id, value in doc}
#print(word_score)

In [10]:
#Sort by term scores
import operator
sorted_word_score = sorted(word_score.items(), key=operator.itemgetter(1),reverse=True)
sorted_word_score

[(u'4', 1.0),
 (u'policy', 1.0),
 (u'p', 1.0),
 (u'research', 1.0),
 (u'12', 1.0),
 (u'j', 1.0),
 (u'measure', 1.0),
 (u'in', 1.0),
 (u'5', 1.0),
 (u'education', 0.9097298558352529),
 (u'2', 0.8397540032818169),
 (u'whether', 0.8239745925257059),
 (u'2008', 0.7753567334050402),
 (u'using', 0.7328280175142081),
 (u'10', 0.7328280175142081),
 (u'second', 0.7071067811865476),
 (u'likely', 0.6809199143253225),
 (u'random', 0.6804139157499614),
 (u'14', 0.6528847313856487),
 (u'period', 0.6521462389505934),
 (u'range', 0.6395055420741893),
 (u'health', 0.631523503888387),
 (u'3', 0.6299769056354008),
 (u'statistically', 0.6219058663282299),
 (u'6', 0.6140925832335032),
 (u'1', 0.6035420554991026),
 (u'significant', 0.6018729660222095),
 (u'survey', 0.5905258087194626),
 (u'possible', 0.5624385926132234),
 (u'measured', 0.5584477984793236),
 (u'grant', 0.5544855398289232),
 (u'greater', 0.5477436977084504),
 (u'spillover', 0.5455247563286337),
 (u'time', 0.5365154351260035),
 (u'et', 0.53560

In [11]:
#tfidf using sklearn package
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = 0, stop_words = stop)
tfidf_matrix =  tf.fit_transform(docs)
feature_names = tf.get_feature_names() 
len(feature_names)

6146

In [12]:
#print(feature_names)

In [13]:
#sort term scores
dense = tfidf_matrix.todense()
sentences = dense[0].tolist()[0]
phrase_scores = [pair for pair in zip(range(0, len(sentences)), sentences) if pair[1] > 0]
#sorted(phrase_scores, key=lambda t: t[1] * -1)

In [14]:
#Match term ID with term
sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1)
for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores][:20]:
    print('{0: <20} {1}'.format(phrase, score))

## Combine files for each type of violence

In [58]:
#Fight_assault_data
from io import open
import glob
path =r'/Users/aarya/Github/Lantern_v2/Data/fight_assault_data'
allFiles = glob.glob(path + "/*.txt")
text = ''
for file in allFiles:
    with open(file, encoding='latin-1') as f:
        text = text + str(f.read())

In [59]:
texts = tokenizer.tokenize(text.decode('utf-8')) # add the decode function here to fix any unicodedecode errors
texts_clean = [clean(doc).split() for doc in texts]
dictionary = corpora.Dictionary(texts_clean)
dictionary.filter_extremes(no_above=0.5)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in texts_clean]

#Words in the dictionary sorted by the number of time it appeared in the doc
sorted_dict = sorted(dictionary.token2id.items(), key=lambda x: x[1], reverse = True)
sorted_dict

[(u'heaviside', 1105),
 (u'bus', 1104),
 (u'requires', 1103),
 (u'2009', 1102),
 (u'serious', 1101),
 (u'crosssectional', 1100),
 (u'time', 1099),
 (u'2007', 1098),
 (u'u', 1097),
 (u'2001', 1096),
 (u'2000', 1095),
 (u'2003', 1094),
 (u'principal', 1093),
 (u'comparison', 1092),
 (u'persistent', 1091),
 (u'interval', 1090),
 (u'odds', 1089),
 (u'established', 1088),
 (u'home', 1087),
 (u'tested', 1086),
 (u'prevalent', 1085),
 (u'longer', 1084),
 (u'e', 1083),
 (u'correlated', 1082),
 (u'younger', 1081),
 (u'building', 1080),
 (u'source', 1079),
 (u'structure', 1078),
 (u'crisishttpwwwapaorgedschoolscpseactivitiesviolenceagainstaspx\xcagoogle',
  1077),
 (u'discharge', 1076),
 (u'variable', 1075),
 (u'rural', 1074),
 (u'infection', 1073),
 (u'intent', 1072),
 (u'important', 1071),
 (u'includes', 1070),
 (u'behavioral', 1069),
 (u'crime', 1068),
 (u'developmental', 1067),
 (u'category', 1066),
 (u'poor', 1065),
 (u'presenting', 1064),
 (u'trajectory', 1063),
 (u'assaultinjured', 1062),

In [17]:
#tfidf using gensim package
tfidf = gensim.models.tfidfmodel.TfidfModel(doc_term_matrix, id2word = dictionary)
corpus_tfidf = tfidf[doc_term_matrix]
word_score = {dictionary.get(id): value for doc in corpus_tfidf for id, value in doc}
sorted_word_score = sorted(word_score.items(), key=operator.itemgetter(1),reverse=True)
sorted_word_score

[(u'jama', 1.0),
 (u'1991', 1.0),
 (u'1993', 1.0),
 (u'1994', 1.0),
 (u'1999', 1.0),
 (u'fleming', 1.0),
 (u'de', 1.0),
 (u'rimpela\u0308', 1.0),
 (u'httpdxdoiorg10', 1.0),
 (u'13', 1.0),
 (u'httpdxdoiorg', 1.0),
 (u'williams', 1.0),
 (u'tel', 1.0),
 (u'24', 1.0),
 (u'23', 1.0),
 (u'ym', 1.0),
 (u'wilson', 1.0),
 (u'kp', 1.0),
 (u'copyright', 1.0),
 (u'pk', 1.0),
 (u'httpdxdoi', 1.0),
 (u'2014b', 1.0),
 (u'2014c', 1.0),
 (u'2002', 1.0),
 (u'2001', 1.0),
 (u'2008', 1.0),
 (u'2009', 1.0),
 (u'33', 1.0),
 (u'34', 1.0),
 (u'1', 1.0),
 (u'2012', 1.0),
 (u'university', 1.0),
 (u'additionalinformationfromnonmetaanalyticsystematicreviews', 1.0),
 (u'metaanalyticresults', 1.0),
 (u'2016', 1.0),
 (u'\xfe', 1.0),
 (u'111', 1.0),
 (u'1985', 1.0),
 (u'\u204e', 1.0),
 (u'tokunaga', 0.893636004391661),
 (u'aggressive', 0.8790149801514608),
 (u'conway', 0.8751134434515477),
 (u'depend', 0.8740917543872065),
 (u'plo', 0.8698965837368409),
 (u'terrymcelrath', 0.8563198065901945),
 (u'offender', 0.855544

In [60]:
#tfidf using sklearn package
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df = 7, stop_words = stop)
tfidf_matrix =  tf.fit_transform(texts)
feature_names = tf.get_feature_names() 
len(feature_names)

837

In [61]:
#sort term scores
dense = tfidf_matrix.todense()
sentences = dense[0].tolist()[0]
phrase_scores = [pair for pair in zip(range(0, len(sentences)), sentences) if pair[1] > 0]
sorted(phrase_scores, key=lambda t: t[1] * -1)

[(451, 0.44263104909080564),
 (197, 0.4319069651654031),
 (600, 0.36840983172136454),
 (409, 0.34319435141885574),
 (66, 0.31754316787945297),
 (309, 0.27123550060173407),
 (401, 0.2683638563385173),
 (106, 0.25220690543419244),
 (659, 0.23237064805273297)]

In [57]:
#Match term ID with term
kwords_dict_3 = {}
sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1)
for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores]:
    kwords_dict_3[phrase] = score
kwords_dict_3

{u'28': 0.30023386300964267,
 u'conducted': 0.32006675943167445,
 u'correlational': 0.3482100873349453,
 u'events': 0.2909772909829126,
 u'factors': 0.2694861360585226,
 u'related': 0.2624140983461781,
 u'school': 0.2688288494627324,
 u'schools': 0.1880146754311687,
 u'setting': 0.2804009665876108,
 u'study': 0.4414027968618256,
 u'vandalism': 0.2809542882025583}

In [None]:
dict_list = [kwords_dict_1, kwords_dict_2, kwords_dict_3, kwords_dict_4, kwords_dict_5]

In [None]:
cat_list = ['Bully', 'Fight_Assault', 'Sexual Harassment', 'Drug', 'Vandalism']
main_keys = dict(zip(cat_list, dict_list))

In [None]:
main_keys['Bully']

In [None]:
import json
json.dump(main_keys, open('main_keys', 'wb'))