# Introduction

In this NoteBook, we will clean the multiple data of temperature for each states, merge them into one master dataset.

# Set up

In [1]:
import pandas as pd
import numpy as np
import csv
from datetime import datetime
import math
from glob import glob
from pathlib import Path
import os

# Cleaning 

**Check the file number in this directory**

In [3]:
files = glob('../Raw_Data/Climate/*.csv')
len(files)

49

**Read all csv file and concatenate into one dataframe**

In [4]:
files = Path('../Raw_Data/Climate').glob('*.csv') # get all csvs in your dir.

df = pd.DataFrame()
for file in files:
    data = pd.read_csv(file,skiprows=[0,1,2,3],delimiter=',')
    # Find the file name which is the year of current df, split with dot, year is the first part.
    stateName = os.path.basename(file).split('.')[0]
    # Add year as a new column
    data.insert(data.shape[1], 'State', stateName)
    # Concat together
    df = pd.concat([df,data])
df

Unnamed: 0,Date,Value,Anomaly,State
0,199012,65.7,2.3,Georgia
1,199112,64.5,1.1,Georgia
2,199212,62.7,-0.7,Georgia
3,199312,63.2,-0.2,Georgia
4,199412,63.6,0.2,Georgia
...,...,...,...,...
27,201712,51.5,2.9,Rhode Island
28,201812,51.6,3.0,Rhode Island
29,201912,50.6,2.0,Rhode Island
30,202012,52.8,4.2,Rhode Island


**The display of time period was like: "1990/12 - 2021/12"(year and month), so we make them to "1990" format(year only)**

In [5]:
df['Date'] = round(df['Date']/100).astype(int)
df.head()

Unnamed: 0,Date,Value,Anomaly,State
0,1990,65.7,2.3,Georgia
1,1991,64.5,1.1,Georgia
2,1992,62.7,-0.7,Georgia
3,1993,63.2,-0.2,Georgia
4,1994,63.6,0.2,Georgia


**Drop 'Anomaly' column**

In [6]:
df.drop(['Anomaly'],axis=1,inplace=True)

**Change the format of dataframe by using pivot**

In [7]:
df = df.pivot(index ='State', columns ='Date')['Value']
df.head()

Date,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,64.9,64.0,62.1,62.3,63.0,63.0,62.1,62.4,65.1,64.1,...,64.9,62.6,62.0,64.6,65.3,65.0,64.5,65.4,64.9,64.2
Alaska,25.4,27.0,25.0,29.9,26.2,27.8,25.1,28.1,28.4,24.0,...,24.3,27.8,30.3,30.0,31.9,29.3,30.4,32.2,27.5,26.5
Arizona,59.7,59.2,59.6,59.8,60.6,61.0,61.9,60.5,59.3,60.8,...,62.1,60.4,62.2,61.8,62.2,63.0,62.3,60.3,62.6,62.1
Arkansas,61.8,61.3,59.9,59.3,60.4,60.5,59.6,59.7,63.1,62.1,...,63.6,59.6,58.7,61.4,62.8,62.5,61.2,61.2,61.1,61.2
California,58.0,58.1,59.3,57.7,58.0,58.9,59.6,59.1,56.7,58.0,...,59.5,59.3,61.4,60.8,60.1,60.3,60.1,58.4,60.5,60.3


**Remove columns name**

In [8]:
df.columns.name = None
df.head()

Unnamed: 0_level_0,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,64.9,64.0,62.1,62.3,63.0,63.0,62.1,62.4,65.1,64.1,...,64.9,62.6,62.0,64.6,65.3,65.0,64.5,65.4,64.9,64.2
Alaska,25.4,27.0,25.0,29.9,26.2,27.8,25.1,28.1,28.4,24.0,...,24.3,27.8,30.3,30.0,31.9,29.3,30.4,32.2,27.5,26.5
Arizona,59.7,59.2,59.6,59.8,60.6,61.0,61.9,60.5,59.3,60.8,...,62.1,60.4,62.2,61.8,62.2,63.0,62.3,60.3,62.6,62.1
Arkansas,61.8,61.3,59.9,59.3,60.4,60.5,59.6,59.7,63.1,62.1,...,63.6,59.6,58.7,61.4,62.8,62.5,61.2,61.2,61.1,61.2
California,58.0,58.1,59.3,57.7,58.0,58.9,59.6,59.1,56.7,58.0,...,59.5,59.3,61.4,60.8,60.1,60.3,60.1,58.4,60.5,60.3


# Output Cleaned Data Version

In [9]:
path = '../Processed_Data/climate_state.csv'
df.to_csv(path)