In [1]:
import pandas as pd

In [2]:
# Load the whitespace-delimited NOAA CDO export straight from its URL.
url = 'https://www.ncei.noaa.gov/orders/cdo/1883795.txt'
# The separator is a raw string: "\s" inside a normal literal is an invalid
# escape sequence and triggers a SyntaxWarning on recent Python versions.
# skiprows=[1] skips the second line of the file (presumably a separator row
# under the header - confirm against the file); -9999 is read as missing.
data = pd.read_csv(url, sep=r"\s+", skiprows=[1], header=0, na_values=-9999)

In [3]:
# Helsinki station: read the CSV and label every row with its station name.
fname = 'helsinki.csv'
datahelsk = pd.read_csv(fname).assign(STATION='Helsinki')

In [4]:
# Sodankyla station: read the CSV (its first column is used as the index)
# and label every row with its station name.
fname2 = 'Sodankyla_data.csv'
datasoda = pd.read_csv(fname2, index_col=0).assign(STATION='Sodankyla')

In [5]:
# Move the index levels back into ordinary columns ('level_0', 'level_1').
data = data.reset_index()
# Reassemble the full station name from the three pieces it is spread over;
# this must happen before 'STATION' is overwritten below.
name_parts = [data['level_1'], data['STATION'], data['STATION_NAME']]
data['STATION_NAME'] = name_parts[0] + ' ' + name_parts[1] + ' ' + name_parts[2]
# 'level_0' becomes the STATION column.
data['STATION'] = data['level_0']
# The helper columns are no longer needed.
data = data.drop(columns=['level_0', 'level_1'])

In [6]:
# Convert the average temperature with the Fahrenheit -> Celsius formula
# and store it in a new column.
data['temp_celsius'] = data['TAVG'].sub(32).mul(5/9)
data.head()

Unnamed: 0,STATION,STATION_NAME,ELEVATION,LATITUDE,LONGITUDE,DATE,PRCP,SNWD,TAVG,TMAX,TMIN,temp_celsius
0,GHCND:MXM00076680,MEXICO CITY MX,2303,19.4,-99.183,19730102,,,64.0,,,17.777778
1,GHCND:MXM00076680,MEXICO CITY MX,2303,19.4,-99.183,19730111,,,67.0,,,19.444444
2,GHCND:MXM00076680,MEXICO CITY MX,2303,19.4,-99.183,19730218,,,68.0,,,20.0
3,GHCND:MXM00076680,MEXICO CITY MX,2303,19.4,-99.183,19730225,,,68.0,,,20.0
4,GHCND:MXM00076680,MEXICO CITY MX,2303,19.4,-99.183,19730410,,,62.0,,,16.666667


In [7]:
# Discard the columns that play no part in the temperature analysis.
unused_columns = ['ELEVATION', 'LATITUDE', 'LONGITUDE', 'PRCP', 'SNWD']
data = data.drop(columns=unused_columns)

In [8]:
# Build an integer key from the first six characters of DATE, used for
# grouping in the next step; the original DATE column is then dropped.
date_text = data['DATE'].astype(str)
data['DATE_m'] = date_text.str[:6].astype(int)
data = data.drop(columns=['DATE'])

In [9]:
# Average every numeric column per DATE_m group.
grouped = data.groupby('DATE_m')
# numeric_only=True is required because `data` still carries the string
# columns STATION and STATION_NAME: pandas >= 2.0 raises a TypeError when
# asked to average them instead of silently dropping them.
data = grouped.mean(numeric_only=True)

In [10]:
# TMAX and TMIN are not used further; drop them.
data = data.drop(columns=['TMAX', 'TMIN'])
data.head()

Unnamed: 0_level_0,TAVG,temp_celsius
DATE_m,Unnamed: 1_level_1,Unnamed: 2_level_1
197301,65.5,18.611111
197302,68.0,20.0
197304,62.0,16.666667
197508,62.375,16.875
197509,59.785714,15.436508


In [11]:
# Turn the DATE_m index back into a regular column, then take characters
# 4-5 of its text form as an integer 'Month' column.
data = data.reset_index()
year_month_text = data['DATE_m'].astype(str)
data['Month'] = year_month_text.str[4:6].astype(int)
data.head()

Unnamed: 0,DATE_m,TAVG,temp_celsius,Month
0,197301,65.5,18.611111,1
1,197302,68.0,20.0,2
2,197304,62.0,16.666667,4
3,197508,62.375,16.875,8
4,197509,59.785714,15.436508,9


# Calculate the temperature anomalies in Sodankyla, i.e. the difference between referenceTemps and the average temperature for each month.

In [12]:
# Group the rows by 'Month'.
group_reftemp = data.groupby('Month')
# Reference temperature: the mean Celsius temperature of each month group.
val_reftemp = group_reftemp['temp_celsius'].mean()
# Outlier bounds: two standard deviations either side of that mean.
two_sigma = 2 * group_reftemp['temp_celsius'].std()
val_outliner_upper = two_sigma + val_reftemp
val_outliner_lower = val_reftemp - two_sigma

In [13]:
# Attach each row's monthly reference temperature and outlier bounds.
# val_reftemp / val_outliner_upper / val_outliner_lower are Series indexed by
# month number, so mapping the 'Month' column through them performs the same
# per-month lookup as a row-by-row if-chain, but vectorized and without
# twelve copy-pasted branches.
data['ref_temp'] = data['Month'].map(val_reftemp)
data['upper_outliner'] = data['Month'].map(val_outliner_upper)
data['lower_outliner'] = data['Month'].map(val_outliner_lower)

In [14]:
# A row is an outlier when its temperature lies strictly outside the bounds.
above_upper = data['temp_celsius'].gt(data['upper_outliner'])
below_lower = data['temp_celsius'].lt(data['lower_outliner'])
data['outliner'] = above_upper | below_lower

In [15]:
# Keep the flagged rows in their own frame, renumbered from zero.
is_outlier = data['outliner'] == True
outliner = data[is_outlier].reset_index(drop=True)
# The helper columns are dropped from the main frame.
data = data.drop(columns=['upper_outliner', 'lower_outliner', 'outliner'])

In [16]:
# Display the rows flagged as outliers.
outliner

Unnamed: 0,DATE_m,TAVG,temp_celsius,Month,ref_temp,upper_outliner,lower_outliner,outliner
0,197301,65.5,18.611111,1,14.5223,17.447475,11.597124,True
1,197302,68.0,20.0,2,16.131664,19.664923,12.598405,True
2,197512,53.842105,12.134503,12,14.787665,16.901101,12.674229,True
3,197602,53.5,11.944444,2,16.131664,19.664923,12.598405,True
4,197608,59.571429,15.31746,8,17.378916,19.351151,15.406682,True
5,199103,69.481481,20.823045,3,18.051736,20.818922,15.28455,True
6,200011,65.833333,18.796296,11,15.728881,18.25533,13.202432,True
7,200104,78.0,25.555556,4,19.41528,23.453692,15.376868,True
8,200105,57.0,13.888889,5,19.538674,23.236445,15.840903,True
9,200206,73.0,22.777778,6,18.52133,21.376702,15.665958,True


In [17]:
# Anomaly: the row's Celsius temperature minus its month's reference value.
data['Diff'] = data['temp_celsius'] - data['ref_temp']

# Calculate the monthly temperature differences between Sodankyla and Helsinki stations

In [18]:
# Join the Sodankyla and Helsinki frames on their shared date/month keys.
# Overlapping column names get pandas' default suffixes:
# '_x' for Sodankyla (left), '_y' for Helsinki (right).
merge_keys = ['DATE_m', 'Month']
helsk_soda = datasoda.merge(datahelsk, on=merge_keys)
helsk_soda.head()

Unnamed: 0,DATE_m,TAVG_x,temp_celsius_x,Month,ref_temp_x,Diff_x,STATION_x,TAVG_y,temp_celsius_y,ref_temp_y,Diff_y,STATION_y
0,195901,,,1,-14.708388,,Sodankyla,22.733,-5.148,-5.351,0.203,Helsinki
1,195902,,,2,-14.130261,,Sodankyla,27.75,-2.361,-5.941,3.58,Helsinki
2,195903,,,3,-9.546836,,Sodankyla,32.581,0.323,-2.44,2.763,Helsinki
3,195904,,,4,-3.067089,,Sodankyla,38.967,3.87,3.424,0.447,Helsinki
4,195905,41.467742,5.259857,5,3.889291,1.370566,Sodankyla,49.452,9.695,10.18,-0.485,Helsinki


In [19]:
# Drop the two station-label columns.
helk_soda = helsk_soda.drop(columns=['STATION_x', 'STATION_y'])
# Relabel the remaining columns by position, replacing the '_x'/'_y' merge
# suffixes with station names.
# NOTE(review): 'TAVG_KELSINKI' looks like a typo for 'TAVG_HELSINKI'; it is
# kept as-is because the name ends up in the saved CSV.
helk_soda.columns = [
    'DATE_m',
    'TAVG_SODANKYLA',
    'temp_celsius_SODANKYLA',
    'Month',
    'ref_temp_SODANKYLA',
    'DIFF_SODANKYLA',
    'TAVG_KELSINKI',
    'temp_celsius_HELSINKI',
    'ref_temp_HELSINKI',
    'DIFF_HELSINKI',
]
helk_soda.head()

Unnamed: 0,DATE_m,TAVG_SODANKYLA,temp_celsius_SODANKYLA,Month,ref_temp_SODANKYLA,DIFF_SODANKYLA,TAVG_KELSINKI,temp_celsius_HELSINKI,ref_temp_HELSINKI,DIFF_HELSINKI
0,195901,,,1,-14.708388,,22.733,-5.148,-5.351,0.203
1,195902,,,2,-14.130261,,27.75,-2.361,-5.941,3.58
2,195903,,,3,-9.546836,,32.581,0.323,-2.44,2.763
3,195904,,,4,-3.067089,,38.967,3.87,3.424,0.447
4,195905,41.467742,5.259857,5,3.889291,1.370566,49.452,9.695,10.18,-0.485


In [20]:
# Combine `data` with the Helsinki/Sodankyla frame on the same two keys.
finaldata = data.merge(helk_soda, on=['DATE_m', 'Month'])

In [21]:
# Give the columns that came from `data` a '_CDMX' suffix so all three
# stations follow the same naming scheme.
cdmx_names = {
    'TAVG': 'TAVG_CDMX',
    'temp_celsius': 'temp_celsius_CDMX',
    'ref_temp': 'ref_temp_CDMX',
    'Diff': 'Diff_CDMX',
}
finaldata = finaldata.rename(columns=cdmx_names)

In [22]:
# The DATE_m key is dropped; 'Month' is the column used from here on.
finaldata = finaldata.drop(columns=['DATE_m'])


In [23]:
# Pairwise temperature differences (Celsius columns) between the stations.
# NOTE(review): 'Monthly_diff_helsk-Soda' is computed as Sodankyla minus
# Helsinki, whereas the other two columns subtract in the order their names
# suggest - confirm which sign is intended.
pairwise_diffs = {
    'Monthly_diff_helsk-Soda': finaldata['temp_celsius_SODANKYLA'] - finaldata['temp_celsius_HELSINKI'],
    'Monthly_diff_helsk-CDMX': finaldata['temp_celsius_HELSINKI'] - finaldata['temp_celsius_CDMX'],
    'Monthly_diff_CDMX-Soda': finaldata['temp_celsius_CDMX'] - finaldata['temp_celsius_SODANKYLA'],
}
finaldata = finaldata.assign(**pairwise_diffs)
# Collapse to one row per calendar month by averaging every column.
finaldata = finaldata.groupby('Month').mean()
# Persist the per-month table as CSV (the file name has no extension).
finaldata.to_csv('Month_relationship_helsk_lokka')

In [24]:
# Display the per-month averages for all three stations.
finaldata

Unnamed: 0_level_0,TAVG_CDMX,temp_celsius_CDMX,ref_temp_CDMX,Diff_CDMX,TAVG_SODANKYLA,temp_celsius_SODANKYLA,ref_temp_SODANKYLA,DIFF_SODANKYLA,TAVG_KELSINKI,temp_celsius_HELSINKI,ref_temp_HELSINKI,DIFF_HELSINKI,Monthly_diff_helsk-Soda,Monthly_diff_helsk-CDMX,Monthly_diff_CDMX-Soda
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,57.911631,14.39535,14.5223,-0.126949,5.766749,-14.574028,-14.708388,0.13436,22.020667,-5.544051,-5.351,-0.193205,-9.029977,-19.939402,28.969378
2,60.690496,15.939164,16.131664,-0.1925,7.893833,-13.392315,-14.130261,0.737946,21.370317,-5.905341,-5.941,0.035829,-7.486973,-21.844506,29.331479
3,64.330812,17.961562,18.051736,-0.090174,16.250269,-8.749851,-9.546836,0.796985,28.6999,-1.83335,-2.44,0.6069,-6.916501,-19.794912,26.711413
4,67.037931,19.465517,19.41528,0.050237,26.597436,-3.001425,-3.067089,0.065665,38.799949,3.777795,3.424,0.353872,-6.779219,-15.687723,22.466942
5,67.033985,19.463325,19.538674,-0.075349,39.102124,3.945625,3.889291,0.056334,50.859951,10.477805,10.18,0.297902,-6.53218,-8.98552,15.5177
6,65.297037,18.498354,18.52133,-0.022976,50.782937,10.434965,10.392185,0.04278,58.466738,14.703738,14.668,0.035881,-4.268773,-3.794616,8.063389
7,62.957849,17.198805,17.243836,-0.045031,56.183786,13.435437,13.516478,-0.081042,63.044158,17.246711,17.281,-0.033737,-3.811274,0.047905,3.763368
8,63.204364,17.335758,17.378916,-0.043158,51.783026,10.99057,10.97593,0.01464,60.411667,15.784262,15.604,0.180595,-4.793692,-1.551496,6.345188
9,62.612816,17.00712,17.070787,-0.063667,42.653659,5.918699,5.771552,0.147147,51.309756,10.727659,10.596,0.131537,-4.808959,-6.279461,11.088421
10,61.744223,16.524568,16.544484,-0.019916,30.87596,-0.624467,-0.862996,0.238529,41.883071,5.490548,5.488,0.002881,-6.115014,-11.03402,17.149035


# How different have the summer temperatures (June, July, August) been between the Helsinki and Sodankyla stations?
# Calculate the monthly differences into a DataFrame and save it (as CSV file) into your own Exercise repository for this week
# What were the summer mean temperatures for both of these stations?
# What were the summer standard deviations for both of these stations?


In [25]:
# Summer (June-August) statistics, taken from the per-month rows of
# `finaldata`, whose index is the month number.
# Rows are selected by month label rather than by the positional slice [5:8]:
# the positional form only lands on June-August when months 1-5 are all
# present, while .loc raises a KeyError if a summer month is missing.
SUMMER_MONTHS = [6, 7, 8]
summer = finaldata.loc[SUMMER_MONTHS]

# NOTE: each std below is the spread of the three monthly mean values, not of
# the underlying individual observations.
summer_mean_helsinki = summer['temp_celsius_HELSINKI'].mean()
summer_std_helsinki = summer['temp_celsius_HELSINKI'].std()
summer_mean_sodankyla = summer['temp_celsius_SODANKYLA'].mean()
summer_std_sodankyla = summer['temp_celsius_SODANKYLA'].std()
summer_mean_cdmx = summer['temp_celsius_CDMX'].mean()
summer_std_cdmx = summer['temp_celsius_CDMX'].std()

# Difference between Helsinki and Sodankyla (this column holds Sodankyla
# minus Helsinki).
diff_jun = finaldata.loc[6, 'Monthly_diff_helsk-Soda']
diff_jul = finaldata.loc[7, 'Monthly_diff_helsk-Soda']
diff_aug = finaldata.loc[8, 'Monthly_diff_helsk-Soda']
# Difference between Mexico City and Sodankyla.
diff_jun_2 = finaldata.loc[6, 'Monthly_diff_CDMX-Soda']
diff_jul_2 = finaldata.loc[7, 'Monthly_diff_CDMX-Soda']
diff_aug_2 = finaldata.loc[8, 'Monthly_diff_CDMX-Soda']
# Difference between Helsinki and Mexico City.
diff_jun_3 = finaldata.loc[6, 'Monthly_diff_helsk-CDMX']
diff_jul_3 = finaldata.loc[7, 'Monthly_diff_helsk-CDMX']
diff_aug_3 = finaldata.loc[8, 'Monthly_diff_helsk-CDMX']

# A difference close to 0 means the two stations had similar temperatures;
# the further from 0, the less similar they were.
print('The difference in June between Helsinki and Sodankyla has been: ', diff_jun)
print('The difference in July between Helsinki and Sodankyla has been: ', diff_jul)
print('The difference in August between Helsinki and Sodankyla has been: ', diff_aug)
print('The difference in June between Mexico city and Sodankyla has been: ', diff_jun_2)
print('The difference in July between Mexico city and Sodankyla has been: ', diff_jul_2)
print('The difference in August between Mexico city and Sodankyla has been: ', diff_aug_2)
print('The difference in June between Helsinki and Mexico city has been: ', diff_jun_3)
print('The difference in July between Helsinki and Mexico city has been: ', diff_jul_3)
print('The difference in August between Helsinki and Mexico city has been: ', diff_aug_3)
print("The helsinki summer mean is: ",summer_mean_helsinki)
print("The helsinki summer std is: ",summer_std_helsinki)
print("The sodankyla summer mean is: ",summer_mean_sodankyla)
print("The sodankyla summer std is: ",summer_std_sodankyla)
print("The Mexico city summer mean is: ",summer_mean_cdmx)
print("The Mexico city summer std is: ",summer_std_cdmx)
print(finaldata)

The difference in June between Helsinki and Sodankyla has been:  -4.268773368606702
The difference in July between Helsinki and Sodankyla has been:  -3.8112738162610813
The difference in August between Helsinki and Sodankyla has been:  -4.7936918416111975
The difference in June between Mexico city and Sodankyla has been:  8.063389185228763
The difference in July between Mexico city and Sodankyla has been:  3.7633683854764586
The difference in August between Mexico city and Sodankyla has been:  6.345187911066053
The difference in June between Helsinki and Mexico city has been:  -3.7946158166220583
The difference in July between Helsinki and Mexico city has been:  0.04790543078462448
The difference in August between Helsinki and Mexico city has been:  -1.5514960694548567
The helsinki summer mean is:  15.911570175438598
The helsinki summer std is:  1.2762573185460353
The sodankyla summer mean is:  11.620323833278936
The sodankyla summer std is:  1.5962927015651267
The Mexico city summer m