In [None]:
# Programming-Poetry-Project
# Dependencies
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats

# File Location
poetry_analysis = "Resources/Poetry_Analysis_Data.csv"

# Load the poetry analysis CSV into a pandas DataFrame
poetry_analysis_df = pd.read_csv(filepath_or_buffer=poetry_analysis)

# Preview the first five rows (head's default row count)
poetry_analysis_df.head()

In [None]:
# Total Genre per Age – Stacked Column Chart (Bilal)
# Count the poems in every (age, type) pair, then pivot the "type" level into
# columns: one row per age, one column per genre. Pairs that never occur are NaN.
age_genre_counts = poetry_analysis_df.groupby(['age', 'type'])['type'].count()
genre_per_age = age_genre_counts.unstack(level='type')
genre_per_age

In [None]:
# Draw the age/genre table built above as a stacked column chart.
# pandas returns the Axes it drew on, so the labels are set on it directly.
genre_ax = genre_per_age.plot.bar(stacked=True, rot=20, alpha=0.8)
genre_ax.set_title("Total Genre per Age", fontsize=16)
genre_ax.set_xlabel("Age", fontsize=12)
genre_ax.set_ylabel("Count of Genres", fontsize=12)
genre_ax.legend(loc="upper left", title="Genre", fontsize=9.6)
genre_ax.set_ylim(0, 385)

# Write the figure to the images folder before showing it
plt.tight_layout()
plt.savefig("Images/genre_per_age.png")
plt.show()

In [None]:
# Gender – Bar Graph (Bilal)

# Count how often each value appears in the "Gender" column; the same counts
# feed the bar graph and the table displayed at the end of the cell.
# NOTE(review): the other columns used in this notebook are lower-case
# ("age", "type") — confirm "Gender" matches the CSV header exactly.
gender_counts = poetry_analysis_df['Gender'].value_counts()

# Create a bar graph from the "Gender" column in the dataframe
gender_counts.plot(kind='bar', figsize=(7, 6), rot=20, color='b', alpha=0.6)
plt.xlabel("Gender", fontsize=12)
plt.ylabel("Count of People", fontsize=12)
plt.title("Gender_Bar Graph", fontsize=16)

# Save the graph to the images folder
plt.tight_layout()
plt.savefig("Images/gender.png")
plt.show()

# print() returns None, so store the count first and print it afterwards;
# otherwise row_count would always be None.
row_count = len(poetry_analysis_df['Gender'])
print("Number of rows present:", row_count)
gender_counts

### Age Distribution of Poetry

In [None]:
# Share of poems for each value of the "age" column, drawn as a pie chart
age_distribution = poetry_analysis_df["age"].value_counts()

# Draw on the current Axes; each wedge is labelled with its "age" value and
# annotated with its percentage to one decimal place.
age_ax = plt.gca()
age_ax.pie(age_distribution, labels=age_distribution.index, autopct="%1.1f%%")
age_ax.legend()
plt.show()

### Length of Poems by Genre

In [None]:
#Used to split strings with multiple delimiters
import re

#Delimiters that separate words: comma, newline, semicolon, space, colon, period.
#Written as a raw string so "\." reaches the regex engine unchanged instead of
#being an invalid string escape sequence.
word_delimiters = r',|\n|;| |:|\.'

#Make a dictionary to contain poem name, word count, and list of words for each poem
#Keys are the poem's position in the dataframe (0, 1, 2, ...)
poem_dict = {}

#Pair each poem name with its content by position, so the loop does not rely on
#the dataframe's index labels being exactly 0..n-1
poem_rows = zip(poetry_analysis_df["poem name"], poetry_analysis_df["content"])

for index, (poem, content) in enumerate(poem_rows):

    if isinstance(content, str):
        #splits the content on the delimiters and removes empty strings
        word_list = [word for word in re.split(word_delimiters, content) if word != ""]
    else:
        #Missing content (e.g. NaN from an empty CSV cell) counts as a poem with no words
        word_list = []

    #adds poem to the poem dictionary
    poem_dict[index] = {
        "poem name": poem,
        "word count": len(word_list),
        "split content": word_list,
    }

In [None]:
#Pull the per-poem word counts and word lists out of poem_dict, in insertion order
poem_entries = list(poem_dict.values())
word_count_series = [entry["word count"] for entry in poem_entries]
split_content_series = [entry["split content"] for entry in poem_entries]

#Attach both lists to the dataframe as new columns
poetry_analysis_df["word count"] = word_count_series
poetry_analysis_df["split content"] = split_content_series

In [None]:
# Keep only the columns needed for the word-count analysis
columns_of_interest = ["poem name", "content", "type", "word count", "split content"]
word_count_df = poetry_analysis_df[columns_of_interest]
word_count_df

In [None]:
#Collect each distinct genre once, in order of first appearance
genres = list(dict.fromkeys(word_count_df["type"]))

#Map every genre to the word counts of the poems of that genre
genre_dict = {
    genre: word_count_df.loc[word_count_df["type"] == genre, "word count"]
    for genre in genres
}


#Draw one horizontal boxplot per genre; outliers are shown as red circles
outlier_style = {'marker': 'o', 'markerfacecolor': 'red'}
fig, ax = plt.subplots(figsize=(20, 4))
ax.boxplot(genre_dict.values(), flierprops=outlier_style, vert=False)
ax.set_yticklabels(genre_dict.keys())
ax.set_title("Boxplots of the Length of Poems by Genre")
ax.set_xlabel("Number of Words")
plt.show()

In [None]:
#Same boxplots as before, restricted to poems of at most 600 words.
#Relies on the `genres` list built in the previous cell.
within_limit = word_count_df["word count"] <= 600

#Map every genre to the word counts of its poems that fall within the limit
genre_dict = {
    genre: word_count_df.loc[(word_count_df["type"] == genre) & within_limit, "word count"]
    for genre in genres
}

#Draw one horizontal boxplot per genre; outliers are shown as red circles
outlier_style = {'marker': 'o', 'markerfacecolor': 'red'}
fig, ax = plt.subplots(figsize=(20, 4))
ax.boxplot(genre_dict.values(), flierprops=outlier_style, vert=False)
ax.set_yticklabels(genre_dict.keys())
ax.set_title("Boxplots of the Length of Poems by Genre (Limit 600 Words)")
ax.set_xlabel("Number of Words")
plt.show()

### N-Gram Frequencies (WIP)

In [None]:
#Dependencies for making N-grams
import nltk, re, string, collections
from nltk.util import ngrams

In [None]:
#Creates bigrams (pairs of consecutive words) for each poem and stores them in bigrams_list.
#Frequency of bigrams are stored in bigrams_frequency_list, one Counter per poem.
#Both lists follow the order of split_content_series.
bigrams_list = []
bigrams_frequency_list = []
for poem in split_content_series:
    #n=2 gives bigrams (n=1 would only produce single words).
    #ngrams yields its results lazily and can be consumed only once, so it is
    #turned into a list before being both stored and counted.
    poem_bigram = list(ngrams(poem, 2))
    bigram_frequency = collections.Counter(poem_bigram)
    bigrams_list.append(poem_bigram)
    bigrams_frequency_list.append(bigram_frequency)

In [None]:
#Print out the top 5 occurring bigrams for each poem:
#for bigram in bigrams_frequency_list:
    #print(bigram.most_common(5))