<a href="https://colab.research.google.com/github/Leo-Kasper/is310-final-project-leo-kasper/blob/main/data_visualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize
from nltk import FreqDist
from nltk.stem import PorterStemmer
porter = PorterStemmer()
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import warnings


In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/ZoeLeBlanc/visualizing_humanities_data_workshop/master/web_scraped_humanist_listserv.csv")

In [3]:
!pip3 install nltk



In [4]:
df = df.iloc[:21]

In [5]:
df

Unnamed: 0,dates,text
0,1987-1988,From: MCCARTY@UTOREPAS\nSubject: \nDate: 12 Ma...
1,1988-1989,From: Sebastian Rahtz \nSubject: C++ and Gnu o...
2,1989-1990,From: Willard McCarty \nSubject: Happy Birthda...
3,1990-1991,From: Elaine Brennan & Allen Renear \nSubject:...
4,1991-1992,From: Elaine Brennan & Allen Renear \nSubject:...
5,1992-1993,From: Elaine M Brennan \nSubject: Humanist's B...
6,1993-1994,From: 6500card%ucsbuxa@hub.ucsb.edu (Cheryl A....
7,1994-1995,From: Andrew Burday \nSubject: Re: 7.0638 Qs: ...
8,1995-1996,"From: ""Gregory Bloomquist"" \nSubject: Round Ta..."
9,1996-1997,From: Humanist \nSubject: Humanist begins its ...


In [6]:
df['start_yr'] = df['dates'].str.split('-').str[0]
df['end_yr'] = df['dates'].str.split('-').str[1]
df['vol_size'] = df['text'].str.count('\n')

In [7]:
df

Unnamed: 0,dates,text,start_yr,end_yr,vol_size
0,1987-1988,From: MCCARTY@UTOREPAS\nSubject: \nDate: 12 Ma...,1987,1988,50794
1,1988-1989,From: Sebastian Rahtz \nSubject: C++ and Gnu o...,1988,1989,36109
2,1989-1990,From: Willard McCarty \nSubject: Happy Birthda...,1989,1990,108695
3,1990-1991,From: Elaine Brennan & Allen Renear \nSubject:...,1990,1991,118929
4,1991-1992,From: Elaine Brennan & Allen Renear \nSubject:...,1991,1992,81193
5,1992-1993,From: Elaine M Brennan \nSubject: Humanist's B...,1992,1993,59957
6,1993-1994,From: 6500card%ucsbuxa@hub.ucsb.edu (Cheryl A....,1993,1994,71763
7,1994-1995,From: Andrew Burday \nSubject: Re: 7.0638 Qs: ...,1994,1995,46569
8,1995-1996,"From: ""Gregory Bloomquist"" \nSubject: Round Ta...",1995,1996,61076
9,1996-1997,From: Humanist \nSubject: Humanist begins its ...,1996,1997,52506


In [8]:
def stem_words(row):
    stemmed_words = ''
    for token in row.text.split(' '):
        stemmed_words += porter.stem(token) + ' '
    return stemmed_words

In [9]:
df['stemmed_text'] = df.apply(stem_words,axis=1)

In [10]:
df

Unnamed: 0,dates,text,start_yr,end_yr,vol_size,stemmed_text
0,1987-1988,From: MCCARTY@UTOREPAS\nSubject: \nDate: 12 Ma...,1987,1988,50794,from: mccarty@utorepas\nsubject: \ndate: 12 ma...
1,1988-1989,From: Sebastian Rahtz \nSubject: C++ and Gnu o...,1988,1989,36109,from: sebastian rahtz \nsubject: c++ and gnu o...
2,1989-1990,From: Willard McCarty \nSubject: Happy Birthda...,1989,1990,108695,from: willard mccarti \nsubject: happi birthda...
3,1990-1991,From: Elaine Brennan & Allen Renear \nSubject:...,1990,1991,118929,from: elain brennan & allen renear \nsubject: ...
4,1991-1992,From: Elaine Brennan & Allen Renear \nSubject:...,1991,1992,81193,from: elain brennan & allen renear \nsubject: ...
5,1992-1993,From: Elaine M Brennan \nSubject: Humanist's B...,1992,1993,59957,from: elain M brennan \nsubject: humanist' bir...
6,1993-1994,From: 6500card%ucsbuxa@hub.ucsb.edu (Cheryl A....,1993,1994,71763,from: 6500card%ucsbuxa@hub.ucsb.edu (cheryl A....
7,1994-1995,From: Andrew Burday \nSubject: Re: 7.0638 Qs: ...,1994,1995,46569,from: andrew burday \nsubject: re: 7.0638 qs: ...
8,1995-1996,"From: ""Gregory Bloomquist"" \nSubject: Round Ta...",1995,1996,61076,"from: ""gregori bloomquist"" \nsubject: round ta..."
9,1996-1997,From: Humanist \nSubject: Humanist begins its ...,1996,1997,52506,from: humanist \nsubject: humanist begin it 10...


In [11]:
documents = df.text.tolist()
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
vectorizer = TfidfVectorizer(max_df=0.5, min_df=1,stop_words=stop_words)

In [13]:
transformed_documents = vectorizer.fit_transform(documents)

transformed_documents_as_array = transformed_documents.toarray()

dates = df.dates.tolist()
tfidf_results = []
for counter, doc in enumerate(transformed_documents_as_array):
    tf_idf_tuples = list(zip(vectorizer.get_feature_names(), doc))
    one_doc_as_df = pd.DataFrame.from_records(tf_idf_tuples, columns=['term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)
    one_doc_as_df['dates'] = dates[counter]
    tfidf_results.append(one_doc_as_df)



In [14]:
warnings.filterwarnings("ignore")

In [15]:
tfidf_df = pd.concat(tfidf_results)
tfidf_df = tfidf_df.sort_values(by=['score'], ascending=False)
tfidf_df.head()

Unnamed: 0,term,score,dates
0,ninch,0.841249,2002-2003
0,ninch,0.822897,2000-2001
0,ninch,0.790522,2001-2002
0,utorepas,0.766377,1987-1988
0,ninch,0.67189,1999-2000


In [16]:
tfidf_df.term.unique()
tfidf_df['start_yr'] = tfidf_df['dates'].str.split('-').str[0].astype(int)
tfidf_df['end_yr'] = tfidf_df['dates'].str.split('-').str[1].astype(int)
tfidf_df

Unnamed: 0,term,score,dates,start_yr,end_yr
0,ninch,0.841249,2002-2003,2002,2003
0,ninch,0.822897,2000-2001,2000,2001
0,ninch,0.790522,2001-2002,2001,2002
0,utorepas,0.766377,1987-1988,1987,1988
0,ninch,0.671890,1999-2000,1999,2000
...,...,...,...,...,...
65250,saab,0.000000,1994-1995,1994,1995
65251,s782293,0.000000,1994-1995,1994,1995
65252,s6,0.000000,1994-1995,1994,1995
65253,s6b,0.000000,1994-1995,1994,1995


In [17]:
#Top 10 unique words for all data
top10_all = tfidf_df.drop_duplicates(subset = 'term', keep = 'first').head(10)
top10_all

Unnamed: 0,term,score,dates,start_yr,end_yr
0,ninch,0.841249,2002-2003,2002,2003
0,utorepas,0.766377,1987-1988,1987,1988
1,amico,0.44418,1999-2000,1999,2000
0,fqs,0.426306,2005-2006,2005,2006
0,wmccarty,0.384587,2007-2008,2007,2008
1,fludd,0.337311,2007-2008,2007,2008
0,google,0.33312,2004-2005,2004,2005
1,elra,0.321378,1997-1998,1997,1998
1,wikipedia,0.316421,2005-2006,2005,2006
0,7848,0.30138,2003-2004,2003,2004


In [25]:
import altair as alt
alt.renderers.enable('default')
alt.data_transformers.enable('default', max_rows=None)

DataTransformerRegistry.enable('default')

In [28]:
#Part 1

alt.Chart(top10_all).mark_bar().encode(
    alt.X('term'),
    alt.Y('score'),
    color=alt.Color('term', scale=alt.Scale(scheme='accent'), sort=alt.Sort()),
)

In [30]:
#Part 2


alt.Chart(tfidf_df).mark_bar().encode(
    x='dates',
    y='score',
    color=alt.Color('term', scale=alt.Scale(scheme='accent'), sort=alt.Sort()),
)

KeyboardInterrupt: ignored

In [32]:
alt.Chart(top10_all).mark_bar().encode(
    x='dates',
    y='score',
    color=alt.Color('term', scale=alt.Scale(scheme='accent'), sort=alt.Sort()),
)