# Exploratory Data Analysis in Python
This notebook contains my notes on exploratory data analysis.



### EDA: Diagnosing Diabetes

In [None]:
import codecademylib3
import pandas as pd
import numpy as np

# Load the diabetes data set and take a first look at it.
diabetes_data = pd.read_csv('diabetes.csv')
print(diabetes_data.head())

# number of features (columns)
print(len(diabetes_data.columns))

# number of observations (rows)
print(len(diabetes_data))

# missing values? (null count per column)
print(diabetes_data.isnull().sum())

# info might tell us more...
# NOTE: DataFrame.info() writes its report itself and returns None, so it is
# called bare -- wrapping it in print() would add a stray "None" line.
diabetes_data.info()

# make sure there really aren't any null values...
print(diabetes_data.describe())
# we have '0's where there shouldn't be any.

# replace '0's with NaNs so they are counted as missing.
# The column list is named once so selection and assignment cannot drift apart.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
diabetes_data[zero_as_missing_cols] = diabetes_data[zero_as_missing_cols].replace(0, np.nan)

# check for nulls again...
print(diabetes_data.isnull().sum())
diabetes_data.info()

# let's look at the rows with missing data:
print(diabetes_data[diabetes_data.isnull().any(axis=1)])

# look at data types of each column
print(diabetes_data.dtypes)

# why is outcome an object?
print(diabetes_data.Outcome.unique())

# let's fix the typo (the letter 'O' where the digit '0' was meant)...
# Replacing the string alone would leave the column as text, so it is also
# converted to a numeric dtype. pd.to_numeric raises ValueError if any other
# non-numeric value is still present, which surfaces further typos early.
diabetes_data['Outcome'] = pd.to_numeric(diabetes_data['Outcome'].replace('O', '0'))

# check Outcome again (values and dtype)...
print(diabetes_data.Outcome.unique())
print(diabetes_data.Outcome.dtype)

# looks better...

### Exploring Student Data


In [None]:
# Load libraries
import pandas as pd
import numpy as np
import codecademylib3
import matplotlib.pyplot as plt
import seaborn as sns

# Import data
students = pd.read_csv('students.csv')

# Print first few rows of data
print(students.head())

# Print summary statistics for all columns
print(students.describe(include='all'))

# Calculate mean
print(students.math_grade.mean())

# Calculate median
print(students.math_grade.median())

# Calculate mode (mode() returns a Series; take the first value)
print(students.math_grade.mode()[0])

# Calculate range
print(students.math_grade.max() - students.math_grade.min())

# Calculate standard deviation
print(students.math_grade.std())

# Calculate MAD (mean absolute deviation around the mean).
# Series.mad() was deprecated in pandas 1.5 and removed in pandas 2.0, so the
# same quantity is computed explicitly to keep this cell runnable.
math_grade_deviation = students.math_grade - students.math_grade.mean()
print(math_grade_deviation.abs().mean())

# Histogram of math grades
sns.histplot(data=students, x='math_grade')
plt.show()
plt.clf()

# Box plot of math grades
sns.boxplot(data=students, x='math_grade')
plt.show()
plt.clf()

# Number of students with mothers in each job category
mjob_counts = students['Mjob'].value_counts()
print(mjob_counts)

# Proportion of students with mothers in each job category
mjob_proportions = students['Mjob'].value_counts(normalize=True)
print(mjob_proportions)

# Bar chart of Mjob
sns.countplot(data=students, x='Mjob')
plt.show()
plt.clf()

# Pie chart of Mjob, drawn from the category counts computed above
mjob_counts.plot.pie()
plt.show()

### Review

In [None]:
import codecademylib3
import pandas as pd

# Load the page-visit log and preview it.
user_visits = pd.read_csv('page_visits.csv')
print(user_visits.head())

# Count visit ids per traffic source.
click_source = (
    user_visits
    .groupby('utm_source')['id']
    .count()
    .reset_index()
)
print(click_source)

# Count visit ids per (traffic source, month) pair.
click_source_by_month = (
    user_visits
    .groupby(['utm_source', 'month'])['id']
    .count()
    .reset_index()
)

# Reshape to one row per source with one column per month.
click_source_by_month_pivot = click_source_by_month.pivot(
    index='utm_source',
    columns='month',
    values='id',
).reset_index()
print(click_source_by_month_pivot)

### A/B Testing for ShoeFly.com

In [None]:
import codecademylib3
import pandas as pd

# Load the ad-click log and preview it.
ad_clicks = pd.read_csv('ad_clicks.csv')
print(ad_clicks.head())

# Number of user ids per traffic source.
views_by_source = ad_clicks.groupby('utm_source')['user_id'].count().reset_index()
print(views_by_source)

# A row counts as a click when its ad_click_timestamp is not null.
ad_clicks['is_click'] = ~ad_clicks['ad_click_timestamp'].isnull()
print(ad_clicks.head())

# User counts per (source, clicked?) pair.
clicks_by_source = (
    ad_clicks
    .groupby(['utm_source', 'is_click'])['user_id']
    .count()
    .reset_index()
)
print(clicks_by_source)

# One row per source, with a False column and a True column of counts.
clicks_pivot = clicks_by_source.pivot(
    index='utm_source',
    columns='is_click',
    values='user_id',
).reset_index()
print(clicks_pivot)

# Share of rows that were clicks, per source.
total_by_source = clicks_pivot[True] + clicks_pivot[False]
clicks_pivot['percent_clicked'] = clicks_pivot[True] / total_by_source
print(clicks_pivot)

# Number of user ids in each experimental group.
clicks_by_group = (
    ad_clicks
    .groupby(['experimental_group'])['user_id']
    .count()
    .reset_index()
)
print(clicks_by_group)

# User counts per (experimental group, clicked?) pair.
clicks_by_group_click = (
    ad_clicks
    .groupby(['experimental_group', 'is_click'])['user_id']
    .count()
    .reset_index()
)
print(clicks_by_group_click)

# One row per is_click value, with one column per experimental group.
clicks_pivot_2 = clicks_by_group_click.pivot(
    index='is_click',
    columns='experimental_group',
    values='user_id',
).reset_index()
print(clicks_pivot_2)

def _daily_click_pivot(daily_counts):
    """Reshape per-day click counts to one row per day and add the click rate.

    Parameters
    ----------
    daily_counts : DataFrame with 'day', 'is_click' and 'user_id' columns,
        where 'user_id' holds the row count for that (is_click, day) pair.

    Returns
    -------
    DataFrame with a 'day' column, one count column per is_click value
    (False / True) and 'percent_clicked' = True / (True + False).
    """
    pivot = daily_counts.pivot(
        index='day',
        columns='is_click',
        values='user_id',
    ).reset_index()
    pivot['percent_clicked'] = pivot[True] / (pivot[True] + pivot[False])
    return pivot


# Split the log by experimental group. Only a preview is printed; dumping the
# full frames floods the output without adding information.
a_clicks = ad_clicks[ad_clicks.experimental_group == 'A']
print(a_clicks.head())
b_clicks = ad_clicks[ad_clicks.experimental_group == 'B']
print(b_clicks.head())

# Group A: counts per (clicked?, day), then the per-day click rate.
clicks_by_a_clicks = a_clicks.groupby(['is_click', 'day']).user_id.count().reset_index()
print(clicks_by_a_clicks)

clicks_pivot_a = _daily_click_pivot(clicks_by_a_clicks)
print(clicks_pivot_a)

# Group B: same summary.
clicks_by_b_clicks = b_clicks.groupby(['is_click', 'day']).user_id.count().reset_index()
print(clicks_by_b_clicks)

clicks_pivot_b = _daily_click_pivot(clicks_by_b_clicks)
print(clicks_pivot_b)

# Overall click rate per group: the mean of the boolean is_click column is the
# fraction of that group's rows that were clicks. (Dividing the number of
# pivot rows -- i.e. the number of days -- by the size of the whole log, as
# was done before, does not measure a click rate.)
percent_clicked_a = a_clicks.is_click.mean()
print(percent_clicked_a)

percent_clicked_b = b_clicks.is_click.mean()
print(percent_clicked_b)

## Inspect, Clean, and Validate a Dataset

## Summarizing a Single Feature

## Aggregates in Pandas

## Summarize the Relationship Between Two Features

## Advanced Data Visualization

## EDA for Machine Learning Models