In [None]:
# imports
import re
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from matplotlib.patches import ConnectionPatch

In [None]:
# Load the two input tables from ./data; the first CSV column of each file
# becomes the DataFrame index.
# NOTE(review): read_csv's default missing-value parsing may turn vocabulary
# entries such as "null" or "nan" into NaN index labels — confirm against the
# unigram file if those words matter.
all_training_data, all_vocab_words = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('./data/train.csv', './data/unigram_freq.csv')
)

In [None]:
# Pull the essay text and the vocab score out of the training table.
#
# Columns are addressed by position, in the order the data is laid out:
# text, cohes, syntax, vocab, phrase, gram, convs. Selecting whole columns
# avoids a row-by-row iterrows() loop that unpacked seven fields per essay
# only to keep two of them.
if all_training_data.shape[1] != 7:
    # Same failure mode as tuple-unpacking a row with the wrong field count.
    raise ValueError(
        f"expected 7 data columns, found {all_training_data.shape[1]}"
    )

all_essays = all_training_data.iloc[:, 0].tolist()   # essay text
all_scores = all_training_data.iloc[:, 3].tolist()   # vocab score

In [None]:
# NumPy view of the essay texts, used by the cleaning loop further down.
np_all_essays = np.asarray(all_essays)

In [None]:
# Build a dictionary with each vocab word (the index label) as key and its
# count (the first data column) as value.
#
# The column is selected by position with .iloc: writing row[0] on a Series
# whose index holds column labels relies on pandas' deprecated
# integer-position fallback. Converting the whole column at once also avoids
# creating one Series per vocabulary row via iterrows().
# If a word occurs more than once, the count from its last row is kept, the
# same outcome as assigning into the dict row by row.
vocab_dict = all_vocab_words.iloc[:, 0].to_dict()

In [None]:
# Tokenise every essay into lowercase alphabetic words and record which share
# of those words is absent from vocab_dict.
#   cleaned_essays  -> one list of word tokens per essay
#   misspelled_perc -> one value per essay, stored as a fraction in [0, 1]
cleaned_essays = []
misspelled_perc = []

for essay in np_all_essays:
  # Remove punctuation first, so a contraction such as "don't" becomes the
  # single token "dont" instead of being split at the apostrophe.
  essay_wo_punc = re.sub(r'[^\w\s]', '', essay)
  essay_lower = essay_wo_punc.lower()

  # re.split returns '' when the text starts or ends with a separator (for
  # example a trailing newline). Those are not words, so drop them: otherwise
  # they end up in cleaned_essays and inflate the denominator below.
  split_essay = [word for word in re.split('[^a-zA-Z]+', essay_lower) if word]
  cleaned_essays.append(split_essay)

  misspell_count = sum(1 for word in split_essay if word not in vocab_dict)

  # An essay without any alphabetic word has nothing to misspell.
  if split_essay:
    misspelled_perc.append(misspell_count / len(split_essay))
  else:
    misspelled_perc.append(0.0)

In [None]:
# Average the misspelled-word share over all essays that share a score.
#   num_essays_per_score -> score: number of essays with that score
#   sum_perc_per_score   -> score: summed misspelled share of those essays
#   scores / avg_perc_per_score -> parallel lists (first-seen score order)
#                                  consumed by the scatter plot
# The per-essay debug print for score 1.5 has been removed; it emitted a
# running sum for every matching essay.
assert len(all_scores) == len(misspelled_perc), \
    "all_scores and misspelled_perc must describe the same essays"

num_essays_per_score = {}
sum_perc_per_score = {}

for score, perc in zip(all_scores, misspelled_perc):
  num_essays_per_score[score] = num_essays_per_score.get(score, 0) + 1
  sum_perc_per_score[score] = sum_perc_per_score.get(score, 0) + perc

scores = list(sum_perc_per_score)
avg_perc_per_score = [
  sum_perc_per_score[score] / num_essays_per_score[score] for score in scores
]

In [None]:
# Scatter every essay's misspelled-word share against its vocab score (blue),
# then overlay the per-score averages as orange diamonds.

fig, ax = plt.subplots(figsize=(6, 6))

point_layers = (
    (all_scores, misspelled_perc, {'color': ['blue']}),
    (scores, avg_perc_per_score, {'color': ['orange'], 'marker': 'D'}),
)
for x_values, y_values, layer_style in point_layers:
    ax.scatter(x_values, y_values, **layer_style)

ax.set_xlabel('True vocab score')
ax.set_ylabel('Percentage of misspelled words')

In [None]:
# Partition the leading half of the vocabulary table into three disjoint
# tiers by row position: the first 1% of rows, the rows from 1% up to 10%,
# and the rows from 10% up to 50%. Rows beyond 50% are not collected.
# NOTE(review): this presumes the table is ordered most-frequent first —
# confirm against the CSV.
num_words = len(all_vocab_words)
top1, top10, top50 = (num_words * share for share in (0.01, 0.1, 0.5))

top1_words, top10_words, top50_words = set(), set(), set()

for rank, vocab_word in enumerate(all_vocab_words.index):
    if rank >= top50:
        # Everything past the 50% cutoff is ignored.
        break
    if rank < top1:
        tier = top1_words
    elif rank < top10:
        tier = top10_words
    else:
        tier = top50_words
    tier.add(vocab_word)

In [None]:
# Count, over all essays, how many word tokens fall into each vocabulary
# tier, then derive the share lists used by the charts below.
top1_count = 0
top10_count = 0
top50_count = 0
other = 0

for essay in cleaned_essays:
  for word in essay:
    if not word:
      # Empty tokens are left over from splitting, not words; without this
      # check they would fall through to `other` and inflate that share.
      continue
    if word in top1_words:
      top1_count += 1
    elif word in top10_words:
      top10_count += 1
    elif word in top50_words:
      top50_count += 1
    else:
      other += 1

total = top1_count + top10_count + top50_count + other
# Every token that is not in the top-1% tier.
bottom = top10_count + top50_count + other

# basic pie chart parameters: share of all tokens per tier
pie1_labels = ['Top 1%', 'Top 10%', 'Top 50%', '']
pie1_percs = [top1_count/total, top10_count/total, top50_count/total, other/total]

# advanced pie chart parameters: top-1% tier versus everything else
pie2_labels = ['Other', 'Top 1%']
pie2_percs = [1 - (top1_count/total), top1_count/total]

# bar chart parameters: breakdown of the non-top-1% tokens (sums to 1)
other_percs = [top10_count/bottom, top50_count/bottom, other/bottom]
other_labels = ['Top 10%', 'Top 50%', '']

In [None]:
# Pie chart of the per-tier token shares, labelled with one-decimal
# percentages and starting at 90 degrees.
fig2, ax2 = plt.subplots()
pie_options = dict(labels=pie1_labels, autopct='%1.1f%%', startangle=90)
ax2.pie(pie1_percs, **pie_options)
# Same scale on both axes.
ax2.axis('equal')

In [None]:
# based on code found at https://matplotlib.org/stable/gallery/pie_and_polar_charts/bar_of_pie.html

# Two side-by-side axes without a gap: pie on the left, stacked bar on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(9, 5))
fig.subplots_adjust(wspace=0)

# Pie: the first wedge is pulled out, and the start angle is chosen so that
# this wedge is split by the x-axis.
explode = [0.1, 0]
angle = -180 * pie2_percs[0]
wedges, *_ = ax1.pie(pie2_percs, autopct='%1.1f%%', startangle=angle,
                     labels=pie2_labels, explode=explode)

# Stacked bar: `bottom` starts at 1 and is lowered by each segment's height,
# so segments are placed from the top down while walking the data in reverse.
# Adding from the top matches the legend.
bottom = 1
width = .2
segments = list(zip(other_percs, other_labels))[::-1]

for j, (height, label) in enumerate(segments):
    bottom -= height
    bc = ax2.bar(0, height, width, bottom=bottom, color='C0', label=label,
                 alpha=0.1 + 0.25 * j)
    ax2.bar_label(bc, labels=[f"{height:.0%}"], label_type='center')

ax2.set_title('Remaining breakdown')
ax2.legend()
ax2.axis('off')
ax2.set_xlim(- 2.5 * width, 2.5 * width)

# Connector lines from the two outer corners of the first wedge to the top
# and bottom of the bar's left edge, drawn with ConnectionPatch.
first_wedge = wedges[0]
theta1, theta2 = first_wedge.theta1, first_wedge.theta2
center, r = first_wedge.center, first_wedge.r
bar_height = sum(other_percs)

# (wedge angle in degrees, y position on the bar): top line first, then bottom.
for theta, bar_y in ((theta2, bar_height), (theta1, 0)):
    x = r * np.cos(np.pi / 180 * theta) + center[0]
    y = r * np.sin(np.pi / 180 * theta) + center[1]
    con = ConnectionPatch(xyA=(-width / 2, bar_y), coordsA=ax2.transData,
                          xyB=(x, y), coordsB=ax1.transData)
    con.set_color([0, 0, 0])
    con.set_linewidth(2)
    ax2.add_artist(con)