# Exploration

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [2]:
def wrangle_grades(path='./student_grades.csv'):
    """Load the student grades CSV and return a cleaned, all-integer DataFrame.

    Parameters
    ----------
    path : str or path-like, default './student_grades.csv'
        Location of the CSV file. The default is the location that was
        previously hard-coded, so existing ``wrangle_grades()`` calls are
        unaffected.

    Returns
    -------
    pandas.DataFrame
        The file's rows with any row containing a missing or
        whitespace-only value removed and every column cast to integer.
        The original row index is kept (it is not reset after dropping).

    Raises
    ------
    FileNotFoundError
        If no file exists at ``path``.
    ValueError
        If a remaining value cannot be converted to an integer.
    """
    raw = pd.read_csv(path)
    # Empty / whitespace-only cells come through as strings; convert them to
    # NaN so that dropna() removes those rows as well.
    with_nulls = raw.replace(r'^\s*$', np.nan, regex=True)
    complete_rows = with_nulls.dropna()
    return complete_rows.astype('int')

In [3]:
# Load and clean the grades data (reads './student_grades.csv' relative to the working directory).
df = wrangle_grades()

FileNotFoundError: [Errno 2] No such file or directory: './student_grades.csv'

In [None]:
# 80% of the rows go to train, the rest to test; the fixed seed keeps the split reproducible.
train, test = train_test_split(
    df,
    train_size=0.8,
    random_state=123,
)

## Why Explore?

- What is the purpose of this pipeline stage?
- Why do we call it out explicitly?
- DIKW (Data → Information → Knowledge → Wisdom)
- DIKIA (https://ds.codeup.com/fundamentals/DataToAction_v2.jpg)

In [None]:
# Preview the first few rows of the cleaned data.
df.head()

## Questions

- Is student_id unique?
- What's the relationship between individual exam scores and final grade? Probably fairly predictive
- Is there a cutoff in grade that makes sense to investigate? Passing/failing, letter grades?

In [None]:
# Count of counts: how many student_id values occur once, twice, and so on.
(
    df['student_id']
    .value_counts()
    .value_counts()
)

In [None]:
# True when the number of distinct student_id values equals the number of rows.
len(df) == df['student_id'].nunique()

In [None]:
# Remove student_id before looking at relationships between the score columns.
# errors='ignore' makes the cell safe to re-run: without it a second run raises
# KeyError because the column is already gone.
df = df.drop(columns='student_id', errors='ignore')

In [None]:
# Compute correlations on the training split only, so the held-out test rows
# do not influence any conclusions drawn during exploration. student_id is
# excluded here, as in the other plots of `train`.
correlation_table = train.drop(columns='student_id').corr()
sns.heatmap(correlation_table, cmap='Blues', annot=True)

* Exam 1 seems to be the most predictive of final grade
* Everything seems to correlate with everything (multicollinearity)

In [None]:
# Pairwise plots of every column in the training split except student_id.
sns.pairplot(
    data=train.drop(columns=['student_id']),
)

* Lots of linear relationships between exams
* Exam scores aren't all normally distributed
* It looks like the distribution of exam 2 scores is bimodal -- there are two peaks in the distribution

In [None]:
# Histogram of exam 2 scores in the training split.
train['exam2'].plot.hist()

In [None]:
# Count exam 2 scores falling at or below 75 versus above 75.
train['exam2'].value_counts(bins=[0, 75, 100])

In [None]:
# Label each training row by whether its exam 2 score is above 75.
# .assign builds a new frame rather than writing into the object returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning (currently
# hidden by the blanket warnings filter) and keeps the cell re-runnable.
train = train.assign(
    exam2_desc=np.where(train.exam2 > 75, 'good grade', 'bad grade')
)

In [None]:
# Distribution of final grades, coloured by the exam 2 good/bad label.
sns.histplot(
    data=train,
    x='final_grade',
    hue='exam2_desc',
)

In [None]:
# Show only a preview of the frame (with the new exam2_desc column) instead of
# rendering the whole training set into the notebook output.
train.head()

In [None]:
# Pairwise plots of the remaining columns, coloured by the exam 2 good/bad label.
# exam2 itself is dropped because the hue already encodes it.
sns.pairplot(
    data=train.drop(columns=['student_id', 'exam2']),
    hue='exam2_desc',
)

In [None]:
# Histogram of exam 1 scores in the training split.
train['exam1'].plot.hist()

In [None]:
# Count exam 1 scores falling at or below 75 versus above 75.
train['exam1'].value_counts(bins=[0, 75, 100])

In [None]:
# Label each training row by whether its exam 1 score is above 75.
# .assign builds a new frame rather than writing into the object returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning (currently
# hidden by the blanket warnings filter) and keeps the cell re-runnable.
train = train.assign(
    exam1_desc=np.where(train.exam1 > 75, 'good grade', 'bad grade')
)

In [None]:
# Average final grade within each exam 1 good/bad group.
(
    train
    .groupby('exam1_desc')['final_grade']
    .mean()
)

Takeaways:

* Students who perform well on exam 1 (scoring above 75) go on to have an average of 90 for their final grade
* Students who perform poorly on exam 1 (scoring 75 or below) go on to, on average, just barely pass the class

In [None]:
# Share of training rows with a final grade below 70 (the mean of a boolean mask).
is_failing = train['final_grade'] < 70
is_failing.mean()

In [None]:
# Flag training rows whose final grade is below 70.
# .assign builds a new frame rather than writing into the object returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning (currently
# hidden by the blanket warnings filter) and keeps the cell re-runnable.
train = train.assign(fails_class=train.final_grade < 70)

In [None]:
# Row counts for each combination of exam 1 label (rows) and fail flag (columns).
pd.crosstab(
    index=train['exam1_desc'],
    columns=train['fails_class'],
)

Nobody who got a good grade on exam 1 went on to fail the class.

In [None]:
# Same table, normalised per row: within each exam 1 label, the share that fails vs. passes.
pd.crosstab(
    index=train['exam1_desc'],
    columns=train['fails_class'],
    normalize='index',
)

* Almost a third (32.5%) of students who get a bad grade (<= 75) on the first exam end up failing the class.

This suggests we can identify students who are likely to fail, and intervene early, by looking at their score on the first exam.