In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

### The pandas `pd.concat( )` function is used to stack two (or more) dataframes, aligning them on shared column names
- we will create two dataframes and then concatenate them
- this would be useful if you had, for example, two different years of data with the same columns that you want to explore together; in a case like this, you would want to add a year column before joining them


In [None]:
# Two small column-oriented sample datasets that share the same column names.
data1 = dict(Col1 = [1, 2, 3],
             Col2 = [4, 5, 6],
             Col3 = [7, 8, 9])
data2 = dict(Col1 = [4, 5, 6],
             Col2 = [7, 8, 9],
             Col3 = [1, 2, 3])

In [None]:
# Build one dataframe per sample dict; the dict keys become the column names.
df_2018, df_2019 = map(pd.DataFrame, (data1, data2))

In [None]:
# Show both frames in one cell output, each under its name, with a divider
# line between them.
print('df_2018', df_2018, '=================', 'df_2019', df_2019, sep = '\n')

#### Before we concatenate `df_2018` and `df_2019` we need to add a column for the year

In [None]:
# Tag every row with its source year (stored as a string) so the rows remain
# distinguishable after the two frames are combined.
for frame, year_label in ((df_2018, '2018'), (df_2019, '2019')):
    frame['year'] = year_label

In [None]:
# Display df_2018 to confirm that the new 'year' column is present.
df_2018

In [None]:
# Stack the two yearly frames row-wise; columns are matched by name and each
# frame keeps its own original row labels.
yearly_frames = [df_2018, df_2019]
concat_data = pd.concat(yearly_frames)
concat_data

## Notice that the index repeats itself

### The `reset_index( )` method will change these to a 0-based incrementing index
- add the `drop = True` argument to prevent saving the current index as a column


In [None]:
# Replace the repeated per-frame row labels with a fresh 0-based index;
# drop = True discards the old index instead of keeping it as a column.
concat_data = concat_data.reset_index(drop = True)
# End the cell with the frame itself rather than print() so the notebook
# renders it as a rich table, the same way the other cells display frames.
concat_data

### Another method for combining data is  `merge( )`
- First we'll read in the schools data again
- And we'll read in the school indicator scores


In [None]:
# Load the schools data from schools_clean.csv and preview the first two rows.
schools_path = '../data/schools_clean.csv'
schools = pd.read_csv(schools_path)
schools.head(n = 2)

In [None]:
# Load the school indicator scores and preview the first two rows.
indicators_path = '../data/school_indicator_scores_suppressed.csv'
indicators = pd.read_csv(indicators_path)
indicators.head(n = 2)

### Before we merge, we need to decide 
- which columns to keep from each dataframe  
- what _type_ of merge we want

#### Let's say we want to look at the effect of poverty on school indicators
- From the schools data, we need name, total number of students, and total who are economically disadvantaged
    - we need to create a column for the total number of students before we can subset the schools dataframe
- From the school indicators dataframe, we need school name, and the six *score* columns
    - filtering the dataframe first so that we only look at Davidson County schools is a good idea
    - we also want to filter in order to keep only the rows where subgroup is "All Students"


In [None]:
# total_students is the row-wise sum of the male and female columns.
schools['total_students'] = schools['male'].add(schools['female'])
schools.head(n = 2)

In [None]:
# Keep only the school name plus the two columns needed to compute the
# share of economically disadvantaged students.
schools_keep_cols = ['name', 'total_students', 'econ_disadv']
schools_subset = schools[schools_keep_cols]

In [None]:
# Restrict the indicators to Davidson County rows for the "All Students"
# subgroup, then report how many rows/columns remain.
is_davidson = indicators['system_name'] == 'Davidson County'
is_all_students = indicators['subgroup'] == 'All Students'
indicators = indicators.loc[is_davidson & is_all_students]
indicators.shape

In [None]:
# Preview the first three rows of the filtered indicators.
indicators.head(n = 3)

In [None]:
# The six score columns to carry into the merge, alongside the school name
# that serves as the join key.
score_cols = ['score_achievement', 'score_growth', 'score_absenteeism',
              'score_grad', 'score_ready_grad', 'score_elpa']
indicators_subset = indicators[['school_name'] + score_cols]

In [None]:
# Compare the (rows, columns) shape of the two subsets before merging.
for label, frame in (('schools subset: ', schools_subset),
                     ('indicators subset: ', indicators_subset)):
    print(label, frame.shape)

### There are 12 more rows in the schools_subset data than there are in the indicators_subset
- We want only the rows that have a match in **both** datasets 
![pandas merge types](../images/pandas_merge_types.png)
- So we want an inner join (which is the default)

In [None]:
# Inner join: keep only schools whose name appears in both subsets.
# The key is called 'name' on the schools side and 'school_name' on the
# indicators side, so both key columns end up in the result.
schools_with_scores = schools_subset.merge(
    indicators_subset,
    how = 'inner',
    left_on = 'name',
    right_on = 'school_name',
)
schools_with_scores.head(n = 3)

### We can drop one of the school name columns and calculate the percentage of students from each school who are economically disadvantaged.

In [None]:
# 'school_name' duplicates 'name' after the merge, so remove it.
schools_with_scores = schools_with_scores.drop('school_name', axis = 'columns')

In [None]:
# Share of each school's students who are economically disadvantaged,
# expressed as a percentage (0-100 scale).
disadv_share = schools_with_scores['econ_disadv'].div(schools_with_scores['total_students'])
schools_with_scores['pct_econ_disadv'] = disadv_share.mul(100)
schools_with_scores.head()

### We'll take a quick peek at the relationship between the percentage of students who are economically disadvantaged and the achievement score for schools and then save this dataset for further exploration

In [None]:
# Scatter plot with one point per row of schools_with_scores: achievement
# score on the x axis, percentage economically disadvantaged on the y axis.
# Uses the explicit fig/ax interface and gives the figure a title and a
# specific x label so it can be understood on its own.
fig, ax = plt.subplots()
ax.scatter(x = 'score_achievement', y = 'pct_econ_disadv', data = schools_with_scores)
# The trailing semicolon suppresses the return value of ax.set in the output.
ax.set(xlabel = 'achievement score',
       ylabel = 'percentage economically disadvantaged',
       title = 'Achievement score vs. percentage economically disadvantaged');

In [None]:
# Save the merged dataset for later use; index = False keeps the row index
# out of the file.
output_path = '../data/schools_with_scores.csv'
schools_with_scores.to_csv(output_path, index = False)

In [None]:
# Read the saved file back in and preview it to confirm the round trip.
clean = pd.read_csv(filepath_or_buffer = '../data/schools_with_scores.csv')
clean.head(n = 5)