## Find important words in research articles about school violence

In [19]:
import pandas as pd
import nltk
import numpy as np
%matplotlib inline  
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
from gensim import corpora
from operator import itemgetter

In [121]:
#Take one txt file as an example
example_file = open('/Users/Esther/Lantern_v2/Data/fight_assault_data/Adolescent Assault Injury- Risk and Protective Factors and Locations of Contact for Intervention.txt', encoding = 'ISO-8859-1')

In [122]:
#Parse by sentences
import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
data = example_file.read()
docs = tokenizer.tokenize(data)

In [123]:
#Clean the input
stop = set(stopwords.words('english'))
stop.update(["this","these","many","may",'we','in','of','to','And','It',"It's","The",'the','abstract','background','them', '1','2','3','4','5','6','7','8','9'])
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

def clean(doc):
    stop_free = " ".join([i.lower() for i in str(doc).split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

In [124]:
#build dictionary and term matrix
doc_clean = [clean(doc).split() for doc in docs]
dictionary = corpora.Dictionary(doc_clean)
dictionary.filter_extremes(no_above=0.1)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

#Words in the dictionary sorted by the number of time it appeared in the doc
sorted_dict = sorted(dictionary.token2id.items(), key=lambda x: x[1], reverse = True)
sorted_dict

[('screening', 102),
 ('our', 101),
 ('lifetime', 100),
 ('history', 99),
 ('home', 98),
 ('of', 97),
 ('data', 96),
 ('used', 95),
 ('behavioral', 94),
 ('including', 93),
 ('community', 92),
 ('social', 91),
 ('using', 90),
 ('question', 89),
 ('audiotape', 88),
 ('in', 87),
 ('consent', 86),
 ('analysis', 85),
 ('time', 84),
 ('uninjured', 83),
 ('possible', 82),
 ('found', 81),
 ('research', 80),
 ('patient', 79),
 ('table', 78),
 ('hospitalized', 77),
 ('we', 76),
 ('sexual', 75),
 ('included', 74),
 ('hospital', 73),
 ('population', 72),
 ('primary', 71),
 ('predictive', 70),
 ('comparison', 69),
 ('problem', 68),
 ('individual', 67),
 ('association', 66),
 ('exposure', 65),
 ('use', 64),
 ('these', 63),
 ('behavior', 62),
 ('high', 61),
 ('potential', 60),
 ('ed', 59),
 ('for', 58),
 ('number', 57),
 ('include', 56),
 ('this', 55),
 ('however', 54),
 ('assessment', 53),
 ('highrisk', 52),
 ('fighting', 51),
 ('weaponcarrying', 50),
 ('access', 49),
 ('school', 48),
 ('involvemen

In [125]:
#tfidf using gensim package
tfidf = gensim.models.tfidfmodel.TfidfModel(doc_term_matrix, id2word = dictionary)

In [126]:
#Fit corpus to get scores for terms in dictionary
corpus_tfidf = tfidf[doc_term_matrix]
word_score = {dictionary.get(id): value for doc in corpus_tfidf for id, value in doc}
#print(word_score)

In [127]:
#Sort by term scores
import operator
sorted_word_score = sorted(word_score.items(), key=operator.itemgetter(1),reverse=True)
sorted_word_score

[('in', 0.7876250651812999),
 ('of', 0.7876250651812999),
 ('age', 0.7769293634413851),
 ('hospitalized', 0.768956907018393),
 ('behavior', 0.6827990356238554),
 ('question', 0.6591676699681592),
 ('interview', 0.6383459899771748),
 ('high', 0.619745151812437),
 ('population', 0.6130089543656961),
 ('atrisk', 0.6081270403165475),
 ('lifetime', 0.6055986595372576),
 ('involvement', 0.6032694783408777),
 ('to', 0.5951937190408129),
 ('predictive', 0.5951937190408129),
 ('conducted', 0.5870329427340855),
 ('result', 0.5870329427340855),
 ('school', 0.5870080464068823),
 ('exposure', 0.5712786894992368),
 ('use', 0.5592533523746622),
 ('potential', 0.5448094811442676),
 ('individual', 0.5444900551837686),
 ('prevention', 0.5398970954068303),
 ('include', 0.5365575079396265),
 ('matched', 0.5334255436892335),
 ('consent', 0.527714856455989),
 ('among', 0.5213860880967386),
 ('weaponcarrying', 0.5213860880967386),
 ('these', 0.5137843667023391),
 ('used', 0.49820766181522225),
 ('95', 0.4952

In [97]:
#tfidf using sklearn package
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = 0, stop_words = stop)
tfidf_matrix =  tf.fit_transform(docs)
feature_names = tf.get_feature_names() 
len(feature_names)

47730

In [49]:
#print(feature_names)

In [98]:
#sort term scores
dense = tfidf_matrix.todense()
sentences = dense[0].tolist()[0]
phrase_scores = [pair for pair in zip(range(0, len(sentences)), sentences) if pair[1] > 0]
#sorted(phrase_scores, key=lambda t: t[1] * -1)

[(23489, 0.26529856946874675),
 (6127, 0.25146900823927426),
 (11607, 0.25146900823927426),
 (17580, 0.25146900823927426),
 (17581, 0.25146900823927426),
 (23487, 0.25146900823927426),
 (25842, 0.25146900823927426),
 (33366, 0.25146900823927426),
 (3363, 0.23404582163163482),
 (3361, 0.22256946200088382),
 (25840, 0.22256946200088382),
 (25839, 0.20145533783852984),
 (11598, 0.19657446932594275),
 (36545, 0.18199525385953136),
 (36543, 0.1805436350106156),
 (6110, 0.1752858798484447),
 (33349, 0.17181428642422203),
 (33344, 0.1676749230875523),
 (23960, 0.15619856345680128),
 (3348, 0.14452390155383932),
 (17425, 0.12344782301142943),
 (23314, 0.12214084722112573),
 (6054, 0.11478730975565171),
 (36350, 0.10575920397671404)]

In [99]:
#Match term ID with term
sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1)
for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores][:20]:
    print('{0: <20} {1}'.format(phrase, score))

injury risk protective 0.26529856946874675
assault injury risk  0.25146900823927426
contact intervention 0.25146900823927426
factors locations    0.25146900823927426
factors locations contact 0.25146900823927426
injury risk          0.25146900823927426
locations contact intervention 0.25146900823927426
protective factors locations 0.25146900823927426
adolescent assault injury 0.23404582163163482
adolescent assault   0.22256946200088382
locations contact    0.22256946200088382
locations            0.20145533783852984
contact              0.19657446932594275
risk protective factors 0.18199525385953136
risk protective      0.1805436350106156
assault injury       0.1752858798484447
protective factors   0.17181428642422203
protective           0.1676749230875523
intervention         0.15619856345680128
adolescent           0.14452390155383932


## Combine files for each type of violence

In [129]:
#Fight_assault_data
import glob
path =r'/Users/Esther/Lantern_v2/Data/fight_assault_data/'
allFiles = glob.glob(path + "/*.txt")
text = ''
for file_ in allFiles:
    with open(file_, encoding="latin-1") as f:
        text = text + str(f.read())

In [115]:
texts = tokenizer.tokenize(text)
texts_clean = [clean(doc).split() for doc in texts]
dictionary = corpora.Dictionary(texts_clean)
dictionary.filter_extremes(no_above=0.5)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in texts_clean]

#Words in the dictionary sorted by the number of time it appeared in the doc
sorted_dict = sorted(dictionary.token2id.items(), key=lambda x: x[1], reverse = True)
sorted_dict

[('team', 1105),
 ('perone', 1104),
 ('hyman', 1103),
 ('dwyer', 1102),
 ('leone', 1101),
 ('mayer', 1100),
 ('soriano', 1099),
 ('harmful', 1098),
 ('hoover', 1097),
 ('craig', 1096),
 ('cultural', 1095),
 ('oak', 1094),
 ('oliver', 1093),
 ('brockenbrough', 1092),
 ('bulach', 1091),
 ('skiba', 1090),
 ('studer', 1089),
 ('eisenbraun', 1088),
 ('kd', 1087),
 ('morrison', 1086),
 ('furlong', 1085),
 ('osofsky', 1084),
 ('cited', 1083),
 ('heaviside', 1082),
 ('459ð469', 1081),
 ('2007', 1080),
 ('kid', 1079),
 ('economic', 1078),
 ('human', 1077),
 ('combined', 1076),
 ('hbsc', 1075),
 ('crossnational', 1074),
 ('prevalent', 1073),
 ('canadian', 1072),
 ('studentsõ', 1071),
 ('canada', 1070),
 ('think', 1069),
 ('fear', 1068),
 ('america', 1067),
 ('retaliate', 1066),
 ('ivr', 1065),
 ('mediating', 1064),
 ('spsir', 1063),
 ('dysfunctional', 1062),
 ('solve', 1061),
 ('nezu', 1060),
 ('dõzurilla', 1059),
 ('view', 1058),
 ('instrumental', 1057),
 ('aq', 1056),
 ('multidimensional', 105

In [102]:
#tfidf using gensim package
tfidf = gensim.models.tfidfmodel.TfidfModel(doc_term_matrix, id2word = dictionary)
corpus_tfidf = tfidf[doc_term_matrix]
word_score = {dictionary.get(id): value for doc in corpus_tfidf for id, value in doc}
sorted_word_score = sorted(word_score.items(), key=operator.itemgetter(1),reverse=True)
sorted_word_score

[('10', 1.0),
 ('address', 1.0),
 ('32', 1.0),
 ('11', 1.0),
 ('minnesota', 1.0),
 ('7', 1.0),
 ('6', 1.0),
 ('educ', 0.8838954499422123),
 ('received', 0.8776556790056448),
 ('stress', 0.8675894586585865),
 ('low', 0.7837963127721629),
 ('unfortunately', 0.7591064951827186),
 ('them', 0.7571788657915562),
 ('2009', 0.7539076550002983),
 ('lansing', 0.7148724499537761),
 ('distinct', 0.7144493013197016),
 ('degree', 0.6890778578365464),
 ('episode', 0.6707643767239545),
 ('tend', 0.6662376711322004),
 ('classroom', 0.652405113327182),
 ('pathway', 0.6522289505811617),
 ('multidimensional', 0.6518984475280244),
 ('duty', 0.6441516263528149),
 ('involvement', 0.6396724947999675),
 ('texas', 0.6202136500612716),
 ('membership', 0.6187044462441232),
 ('described', 0.6138971379254753),
 ('r', 0.6111870878688889),
 ('majority', 0.6072207324408523),
 ('considered', 0.6037040674065984),
 ('2011', 0.6024840366240158),
 ('trajectory', 0.5990412804105867),
 ('we', 0.5983087230258023),
 ('prevalen

In [136]:
#tfidf using sklearn package
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,5), min_df = 7, stop_words = stop)
tfidf_matrix =  tf.fit_transform(texts)
feature_names = tf.get_feature_names() 
len(feature_names)
#feature_names

1079

In [137]:
#sort term scores
dense = tfidf_matrix.todense()
sentences = dense[0].tolist()[0]
phrase_scores = [pair for pair in zip(range(0, len(sentences)), sentences) if pair[1] > 0]
#sorted(phrase_scores, key=lambda t: t[1] * -1)

[(569, 0.349048744487835),
 (250, 0.3405919766274855),
 (835, 0.3153315049578426),
 (834, 0.3128163780710334),
 (141, 0.3037066028827922),
 (762, 0.2976915955908614),
 (761, 0.2905195861958976),
 (525, 0.2706352338457243),
 (72, 0.2504072958656692),
 (392, 0.2138901261898197),
 (511, 0.21162561305466937),
 (137, 0.1988846102725775),
 (830, 0.18324219036426342)]

In [135]:
#Match term ID with term
sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1)
for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores][:20]:
    print('{0: <20} {1}'.format(phrase, score))

risk protective factors 0.5127876891759959
risk protective      0.5086976249610557
assault injury       0.49388343578476157
protective factors   0.48410191493730187
