In [2]:
import pandas as pd
import numpy as np
import csv
from datetime import datetime
import math

#### In this path, we have 49 files need to be merged and cleaned, each of file contains the average temperature of a state over 1990-2021

In [3]:
# first create a dict to store the state name, make it easier to create dataframe and read the file
State = {'Alabama','Alaska','Arizona','Arkansas','California','Colorado','Connecticut','Delaware','Florida','Georgia',
             'Idaho','Illinois','Indiana','Iowa','Kansa','Kentucky','Louisiana','Maine','Maryland','Massachusetts','Michigan',
             'Minnesota','Mississippi','Missouri','Montana','Nebraska','Nevada','New Hampshire','New Jersey','New Mexico','New York',
             'North Carolina','North Dakota','Ohio','Oklahoma','Oregon','Pennsylvania','Rhode Island','South Carolina','South Dakota','Tennessee','Texas',
             'Utah','Vermont','Virginia','Washington','West Virginia','Wisconsin','Wyoming'}

In [4]:
# create multiple dataframe, one data frame for each file, skip the title we don't need
df={}
for i in State:
    df[i] = pd.read_csv( i +'.csv',skiprows=[0,1,2,3], delimiter = ',')

In [5]:
df['Alabama'].head()

Unnamed: 0,Date,Value,Anomaly
0,199012,64.9,1.8
1,199112,64.0,0.9
2,199212,62.1,-1.0
3,199312,62.3,-0.8
4,199412,63.0,-0.1


In [6]:
# the display of time period was like: "1990/12 - 1991/12", so we make them to "1990" format
for i in State:
    df[i] = df[i].drop(['Anomaly'],axis=1)
    for index, row in df[i].iterrows():
        df[i].iloc[index,0] = math.floor(df[i].iloc[index,0]/100)

In [7]:
# pick 'Alabama' as example:
df['Alabama'].head()

Unnamed: 0,Date,Value
0,1990,64.9
1,1991,64.0
2,1992,62.1
3,1993,62.3
4,1994,63.0


### Now we have 49 dataframes, we need to combined them

In [8]:
# create a new dataframe to store, each row corresponding to a year
new_df = pd.DataFrame(columns=State)
index = []
for year in range(1990,2022):
    index.append(year)
new_df.insert(loc=0,column='Year',value=index)

In [9]:
# now all value in the data frame are NaN, we need to fill in
new_df.head()

Unnamed: 0,Year,South Dakota,Arkansas,Connecticut,Arizona,Oregon,Mississippi,Vermont,Ohio,Rhode Island,...,New Mexico,Virginia,Tennessee,Wisconsin,North Carolina,Massachusetts,Pennsylvania,Texas,Alaska,Colorado
0,1990,,,,,,,,,,...,,,,,,,,,,
1,1991,,,,,,,,,,...,,,,,,,,,,
2,1992,,,,,,,,,,...,,,,,,,,,,
3,1993,,,,,,,,,,...,,,,,,,,,,
4,1994,,,,,,,,,,...,,,,,,,,,,


In [10]:
# put the value into new dataframe
for state in State:
    new_df[state] = df[state]['Value']

In [11]:
new_df.head()

Unnamed: 0,Year,South Dakota,Arkansas,Connecticut,Arizona,Oregon,Mississippi,Vermont,Ohio,Rhode Island,...,New Mexico,Virginia,Tennessee,Wisconsin,North Carolina,Massachusetts,Pennsylvania,Texas,Alaska,Colorado
0,1990,46.9,61.8,51.1,59.7,47.4,65.1,44.0,52.7,51.5,...,53.3,57.4,59.4,44.5,61.1,49.7,50.7,65.6,25.4,45.6
1,1991,46.8,61.3,51.0,59.2,47.6,64.3,43.5,53.1,51.4,...,52.5,57.0,59.1,43.7,60.2,49.6,50.7,64.8,27.0,45.0
2,1992,45.6,59.9,47.7,59.6,49.4,62.8,40.6,50.3,48.4,...,52.6,54.4,57.0,42.7,58.2,46.4,47.6,64.4,25.0,45.3
3,1993,42.3,59.3,48.6,59.8,45.5,62.7,41.2,50.4,49.3,...,53.1,55.1,57.2,41.7,58.7,47.3,48.2,64.1,29.9,43.8
4,1994,44.9,60.4,48.9,60.6,47.8,63.5,41.3,50.5,49.6,...,54.4,55.3,57.6,43.0,59.0,47.7,48.0,65.6,26.2,46.5


In [12]:
# Now we have cleanned data, save it into processed file
path = '../../processed/climate_state.csv'
new_df.to_csv(path)