# Introduction

In this NoteBook, we will clean the multiple data of temperature for each states, merge them into one master dataset

# Set up

In [23]:
import pandas as pd
import numpy as np
import csv
from datetime import datetime
import math

#### In this path, we have 49 files need to be merged and cleaned, each of file contains the average temperature of a state over 1990-2021

In [24]:
# first create a dict to store the state name, make it easier to create dataframe and read the file
State = {'Alabama','Alaska','Arizona','Arkansas','California','Colorado','Connecticut','Delaware','Florida','Georgia',
             'Idaho','Illinois','Indiana','Iowa','Kansas','Kentucky','Louisiana','Maine','Maryland','Massachusetts','Michigan',
             'Minnesota','Mississippi','Missouri','Montana','Nebraska','Nevada','New Hampshire','New Jersey','New Mexico','New York',
             'North Carolina','North Dakota','Ohio','Oklahoma','Oregon','Pennsylvania','Rhode Island','South Carolina','South Dakota','Tennessee','Texas',
             'Utah','Vermont','Virginia','Washington','West Virginia','Wisconsin','Wyoming'}

In [25]:
# create multiple dataframe, one data frame for each file, skip the title we don't need
df={}
for i in State:
    df[i] = pd.read_csv('../Raw_Data/Climate/' + i +'.csv',skiprows=[0,1,2,3], delimiter = ',')

In [26]:
df['Alabama'].head()

Unnamed: 0,Date,Value,Anomaly
0,199012,64.9,1.8
1,199112,64.0,0.9
2,199212,62.1,-1.0
3,199312,62.3,-0.8
4,199412,63.0,-0.1


In [27]:
# the display of time period was like: "1990/12 - 1991/12", so we make them to "1990" format
for i in State:
    df[i] = df[i].drop(['Anomaly'],axis=1)
    for index, row in df[i].iterrows():
        df[i].iloc[index,0] = math.floor(df[i].iloc[index,0]/100)

In [28]:
# pick 'Alabama' as example:
df['Alabama'].head()

Unnamed: 0,Date,Value
0,1990,64.9
1,1991,64.0
2,1992,62.1
3,1993,62.3
4,1994,63.0


### Now we have 49 dataframes, we need to combined them

In [29]:
# create a new dataframe to store, each row corresponding to a year
new_df = pd.DataFrame(columns=State)
index = []
for year in range(1990,2022):
    index.append(year)
new_df.insert(loc=0,column='Year',value=index)

In [30]:
# now all value in the data frame are NaN, we need to fill in
new_df.head()

Unnamed: 0,Year,Massachusetts,North Carolina,Arkansas,North Dakota,Kansas,New Hampshire,New Jersey,Missouri,Wyoming,...,Vermont,Kentucky,Oregon,Montana,Texas,Colorado,Connecticut,Nebraska,Alabama,New York
0,1990,,,,,,,,,,...,,,,,,,,,,
1,1991,,,,,,,,,,...,,,,,,,,,,
2,1992,,,,,,,,,,...,,,,,,,,,,
3,1993,,,,,,,,,,...,,,,,,,,,,
4,1994,,,,,,,,,,...,,,,,,,,,,


In [31]:
# put the value into new dataframe
for state in State:
    new_df[state] = df[state]['Value']

In [32]:
new_df.head()

Unnamed: 0,Year,Massachusetts,North Carolina,Arkansas,North Dakota,Kansas,New Hampshire,New Jersey,Missouri,Wyoming,...,Vermont,Kentucky,Oregon,Montana,Texas,Colorado,Connecticut,Nebraska,Alabama,New York
0,1990,49.7,61.1,61.8,42.6,55.5,45.0,54.4,56.2,41.8,...,44.0,57.3,47.4,42.8,65.6,45.6,51.1,50.0,64.9,47.3
1,1991,49.6,60.2,61.3,42.3,55.7,44.5,54.4,56.1,41.7,...,43.5,57.4,47.6,42.9,64.8,45.0,51.0,50.1,64.0,47.1
2,1992,46.4,58.2,59.9,41.2,54.2,41.8,51.2,54.5,42.5,...,40.6,55.1,49.4,43.5,64.4,45.3,47.7,48.9,62.1,43.8
3,1993,47.3,58.7,59.3,38.3,51.9,42.5,52.2,53.0,38.9,...,41.2,55.0,45.5,39.6,64.1,43.8,48.6,46.0,62.3,44.2
4,1994,47.7,59.0,60.4,40.0,54.9,42.7,52.2,54.7,42.9,...,41.3,55.4,47.8,42.7,65.6,46.5,48.9,49.2,63.0,44.3


Swap row & column

In [59]:
df_trans = new_df.set_index('Year').T
df_trans.head()

Year,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
Massachusetts,49.7,49.6,46.4,47.3,47.7,47.8,47.3,47.4,50.4,49.8,...,51.3,48.8,47.7,48.5,50.3,49.6,49.5,48.5,50.9,50.7
North Carolina,61.1,60.2,58.2,58.7,59.0,58.4,57.8,58.1,60.8,59.7,...,60.7,58.6,58.4,60.5,60.8,61.0,60.4,61.3,60.8,60.2
Arkansas,61.8,61.3,59.9,59.3,60.4,60.5,59.6,59.7,63.1,62.1,...,63.6,59.6,58.7,61.4,62.8,62.5,61.2,61.2,61.1,61.2
North Dakota,42.6,42.3,41.2,38.3,40.0,39.6,36.5,40.7,43.2,43.0,...,44.1,38.8,39.0,43.4,44.4,42.0,39.8,38.0,42.3,43.8
Kansas,55.5,55.7,54.2,51.9,54.9,53.9,53.1,53.7,56.3,56.0,...,58.2,53.7,53.7,56.3,57.2,56.5,54.6,53.8,55.6,56.1


Convert column name into string

In [60]:
df_trans.columns = df_trans.columns.astype(str)

Change Row and Column index name

In [65]:
df_trans.index.names = ['State']
df_trans.columns.names = ['']
df_trans.head()

Unnamed: 0_level_0,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Massachusetts,49.7,49.6,46.4,47.3,47.7,47.8,47.3,47.4,50.4,49.8,...,51.3,48.8,47.7,48.5,50.3,49.6,49.5,48.5,50.9,50.7
North Carolina,61.1,60.2,58.2,58.7,59.0,58.4,57.8,58.1,60.8,59.7,...,60.7,58.6,58.4,60.5,60.8,61.0,60.4,61.3,60.8,60.2
Arkansas,61.8,61.3,59.9,59.3,60.4,60.5,59.6,59.7,63.1,62.1,...,63.6,59.6,58.7,61.4,62.8,62.5,61.2,61.2,61.1,61.2
North Dakota,42.6,42.3,41.2,38.3,40.0,39.6,36.5,40.7,43.2,43.0,...,44.1,38.8,39.0,43.4,44.4,42.0,39.8,38.0,42.3,43.8
Kansas,55.5,55.7,54.2,51.9,54.9,53.9,53.1,53.7,56.3,56.0,...,58.2,53.7,53.7,56.3,57.2,56.5,54.6,53.8,55.6,56.1


## Now we have cleanned data, save it into processed file

In [67]:
path = '../Processed_Data/climate_state.csv'
df_trans.to_csv(path)