# Introduction

In this NoteBook, we will clean the multiple data of temperature for each states, merge them into one master dataset

# Set up

In [1]:
import pandas as pd
import numpy as np
import csv
from datetime import datetime
import math

#### In this path, we have 49 files need to be merged and cleaned, each of file contains the average temperature of a state over 1990-2021

In [2]:
# first create a dict to store the state name, make it easier to create dataframe and read the file
State = {'Alabama','Alaska','Arizona','Arkansas','California','Colorado','Connecticut','Delaware','Florida','Georgia',
             'Idaho','Illinois','Indiana','Iowa','Kansa','Kentucky','Louisiana','Maine','Maryland','Massachusetts','Michigan',
             'Minnesota','Mississippi','Missouri','Montana','Nebraska','Nevada','New Hampshire','New Jersey','New Mexico','New York',
             'North Carolina','North Dakota','Ohio','Oklahoma','Oregon','Pennsylvania','Rhode Island','South Carolina','South Dakota','Tennessee','Texas',
             'Utah','Vermont','Virginia','Washington','West Virginia','Wisconsin','Wyoming'}

In [3]:
# create multiple dataframe, one data frame for each file, skip the title we don't need
df={}
for i in State:
    df[i] = pd.read_csv('../Raw_Data/' + i +'.csv',skiprows=[0,1,2,3], delimiter = ',')

In [10]:
df['Alabama'].head()

Unnamed: 0,Date,Value,Anomaly
0,199012,64.9,1.8
1,199112,64.0,0.9
2,199212,62.1,-1.0
3,199312,62.3,-0.8
4,199412,63.0,-0.1


In [11]:
# the display of time period was like: "1990/12 - 1991/12", so we make them to "1990" format
for i in State:
    df[i] = df[i].drop(['Anomaly'],axis=1)
    for index, row in df[i].iterrows():
        df[i].iloc[index,0] = math.floor(df[i].iloc[index,0]/100)

In [12]:
# pick 'Alabama' as example:
df['Alabama'].head()

Unnamed: 0,Date,Value
0,1990,64.9
1,1991,64.0
2,1992,62.1
3,1993,62.3
4,1994,63.0


### Now we have 49 dataframes, we need to combined them

In [13]:
# create a new dataframe to store, each row corresponding to a year
new_df = pd.DataFrame(columns=State)
index = []
for year in range(1990,2022):
    index.append(year)
new_df.insert(loc=0,column='Year',value=index)

In [14]:
# now all value in the data frame are NaN, we need to fill in
new_df.head()

Unnamed: 0,Year,Florida,Pennsylvania,New Hampshire,Texas,Louisiana,Minnesota,South Carolina,Washington,New Jersey,...,Georgia,Kentucky,South Dakota,Massachusetts,Vermont,Missouri,Nebraska,Rhode Island,Montana,Wyoming
0,1990,,,,,,,,,,...,,,,,,,,,,
1,1991,,,,,,,,,,...,,,,,,,,,,
2,1992,,,,,,,,,,...,,,,,,,,,,
3,1993,,,,,,,,,,...,,,,,,,,,,
4,1994,,,,,,,,,,...,,,,,,,,,,


In [15]:
# put the value into new dataframe
for state in State:
    new_df[state] = df[state]['Value']

In [16]:
new_df.head()

Unnamed: 0,Year,Florida,Pennsylvania,New Hampshire,Texas,Louisiana,Minnesota,South Carolina,Washington,New Jersey,...,Georgia,Kentucky,South Dakota,Massachusetts,Vermont,Missouri,Nebraska,Rhode Island,Montana,Wyoming
0,1990,72.5,50.7,45.0,65.6,67.8,42.5,65.0,47.3,54.4,...,65.7,57.3,46.9,49.7,44.0,56.2,50.0,51.5,42.8,41.8
1,1991,71.8,50.7,44.5,64.8,66.9,41.7,63.8,47.1,54.4,...,64.5,57.4,46.8,49.6,43.5,56.1,50.1,51.4,42.9,41.7
2,1992,70.3,47.6,41.8,64.4,65.8,40.9,61.9,48.6,51.2,...,62.7,55.1,45.6,46.4,40.6,54.5,48.9,48.4,43.5,42.5
3,1993,70.3,48.2,42.5,64.1,65.6,38.9,62.6,45.6,52.2,...,63.2,55.0,42.3,47.3,41.2,53.0,46.0,49.3,39.6,38.9
4,1994,71.6,48.0,42.7,65.6,66.5,40.9,62.8,47.9,52.2,...,63.6,55.4,44.9,47.7,41.3,54.7,49.2,49.6,42.7,42.9


In [17]:
# Now we have cleanned data, save it into processed file
path = '../Processed_Data/climate_state.csv'
new_df.to_csv(path)