In [1]:
import pandas as pd

In [2]:
# Read the whitespace-delimited NOAA export straight from its URL.
#  - sep is a raw string so the regex `\s+` reaches pandas intact instead of
#    being parsed as an (invalid) Python string escape.
#  - skiprows=[1] skips the second physical line of the file
#    (presumably a separator row under the header -- TODO confirm).
#  - na_values=-9999 converts the -9999 sentinel to NaN.
url = 'https://www.ncei.noaa.gov/orders/cdo/1881585.txt'
data = pd.read_csv(url, sep=r"\s+", skiprows=[1], header=0, na_values=-9999)

In [3]:
# Load the Helsinki table from a local CSV and tag every row with a fixed
# station label.
fname = 'helsinki.csv'
datahelsk = pd.read_csv(fname).assign(STATION='Helsinki')

In [4]:
# Move the index levels into ordinary columns; the next cell reads them back
# as 'level_0' and 'level_1'.
data = data.reset_index()

In [5]:
# Rebuild the station columns from the pieces that landed in the wrong place:
# the full station name is level_1 + STATION + STATION_NAME (space-joined) and
# the station identifier is level_0. The helper level_* columns are dropped.
station_id = data['level_0']
full_name = data['level_1'] + ' ' + data['STATION'] + ' ' + data['STATION_NAME']
data = (
    data.assign(STATION_NAME=full_name, STATION=station_id)
        .drop(columns=['level_0', 'level_1'])
)
data.head()

Unnamed: 0,STATION,STATION_NAME,ELEVATION,LATITUDE,LONGITUDE,DATE,PRCP,TMAX,TMIN
0,GHCND:FIE00146538,SODANKYLA LOKKA FI,240,67.8206,27.7503,19590101,0.03,,9.0
1,GHCND:FIE00146538,SODANKYLA LOKKA FI,240,67.8206,27.7503,19590102,0.0,,6.0
2,GHCND:FIE00146538,SODANKYLA LOKKA FI,240,67.8206,27.7503,19590103,0.02,,-9.0
3,GHCND:FIE00146538,SODANKYLA LOKKA FI,240,67.8206,27.7503,19590104,0.08,,10.0
4,GHCND:FIE00146538,SODANKYLA LOKKA FI,240,67.8206,27.7503,19590105,0.09,,13.0


# Calculate the average temperature using columns TMAX and TMIN and insert those values into a new column called TAVG.

In [6]:
# Daily average temperature as the midpoint of TMAX and TMIN, then the same
# value converted from Fahrenheit to Celsius. Rows missing either TMAX or
# TMIN yield NaN.
data['TAVG'] = data['TMAX'].add(data['TMIN']).div(2)
data['temp_celsius'] = data['TAVG'].sub(32).mul(5 / 9)

In [7]:
# Remove the columns that are not needed for the temperature analysis.
unused_columns = ['ELEVATION', 'LATITUDE', 'LONGITUDE', 'PRCP']
data = data.drop(columns=unused_columns)

In [8]:
# Preview the first rows after dropping the unused columns.
data.head()

Unnamed: 0,STATION,STATION_NAME,DATE,TMAX,TMIN,TAVG,temp_celsius
0,GHCND:FIE00146538,SODANKYLA LOKKA FI,19590101,,9.0,,
1,GHCND:FIE00146538,SODANKYLA LOKKA FI,19590102,,6.0,,
2,GHCND:FIE00146538,SODANKYLA LOKKA FI,19590103,,-9.0,,
3,GHCND:FIE00146538,SODANKYLA LOKKA FI,19590104,,10.0,,
4,GHCND:FIE00146538,SODANKYLA LOKKA FI,19590105,,13.0,,


In [9]:
# Build an integer YYYYMM key from the first six characters of DATE so that
# rows can be grouped by calendar month.
date_text = data['DATE'].astype(str)
data['DATE_m'] = date_text.str[:6].astype(int)

In [10]:
# The daily DATE column is no longer needed once DATE_m exists.
data = data.drop(columns=['DATE'])

In [11]:
# Average the numeric columns within each year-month (DATE_m becomes the index).
# numeric_only=True is required because the frame still carries the text
# columns STATION and STATION_NAME: pandas >= 2.0 raises TypeError when asked
# to average them, whereas older versions dropped them silently.
grouped = data.groupby('DATE_m')
data = grouped.mean(numeric_only=True)

In [12]:
# Only the averaged temperature columns are kept from here on.
data = data.drop(columns=['TMAX', 'TMIN'])

In [13]:
# Turn the DATE_m index back into a column, then extract the calendar month
# (characters 5-6 of the YYYYMM key) as an integer.
data = data.reset_index()
year_month_text = data['DATE_m'].astype(str)
data['Month'] = year_month_text.str[4:6].astype(int)
data.head()

Unnamed: 0,DATE_m,TAVG,temp_celsius,Month
0,195901,,,1
1,195902,,,2
2,195903,,,3
3,195904,,,4
4,195905,41.467742,5.259857,5


# Calculate the temperature anomalies in Sodankyla, i.e. the difference between referenceTemps and the average temperature for each month.

In [14]:
# Group the year-month rows by calendar month; the per-month reference
# temperature and outlier bounds are computed from this grouping.
group_reftemp = data.groupby('Month')

In [15]:
# Reference temperature: mean Celsius value of each calendar month over all years.
celsius_by_month = group_reftemp['temp_celsius']
val_reftemp = celsius_by_month.mean()
# Outlier bounds: reference temperature +/- two standard deviations.
two_sigma = 2 * celsius_by_month.std()
val_outliner_upper = two_sigma + val_reftemp
val_outliner_lower = -two_sigma + val_reftemp

In [16]:
# Attach each calendar month's reference temperature and outlier bounds to
# every row. Series.map looks up each row's Month in the per-month Series
# (indexed by Month), replacing a row-by-row iterrows() loop with twelve
# copy-pasted branches. The columns are created even if the frame is empty.
data['ref_temp'] = data['Month'].map(val_reftemp)
data['upper_outliner'] = data['Month'].map(val_outliner_upper)
data['lower_outliner'] = data['Month'].map(val_outliner_lower)

In [17]:
# A month is an outlier when its mean temperature falls outside the
# [lower_outliner, upper_outliner] band; NaN temperatures are never flagged.
above_upper = data['temp_celsius'].gt(data['upper_outliner'])
below_lower = data['temp_celsius'].lt(data['lower_outliner'])
data['outliner'] = above_upper | below_lower

In [18]:
# Show the first 12 year-month rows with the reference temperature and
# outlier columns.
data.head(12)

Unnamed: 0,DATE_m,TAVG,temp_celsius,Month,ref_temp,upper_outliner,lower_outliner,outliner
0,195901,,,1,-14.708388,-7.142686,-22.274091,False
1,195902,,,2,-14.130261,-5.51627,-22.744252,False
2,195903,,,3,-9.546836,-2.211097,-16.882575,False
3,195904,,,4,-3.067089,1.534371,-7.668549,False
4,195905,41.467742,5.259857,5,3.889291,8.039371,-0.26079,False
5,195906,52.083333,11.157407,6,10.392185,13.956748,6.827621,False
6,195907,54.95,12.75,7,13.516478,16.979224,10.053732,False
7,195908,52.564516,11.424731,8,10.97593,13.447335,8.504525,False
8,195909,38.833333,3.796296,9,5.771552,8.906087,2.637016,False
9,195910,28.370968,-2.016129,10,-0.862996,4.337056,-6.063048,False


In [19]:
# Keep only the rows flagged as outliers (the 'outliner' column is boolean,
# so it can be used directly as a mask).
is_outlier = data['outliner']
outliner = data[is_outlier]

In [20]:
# Preview the first flagged year-months.
outliner.head()

Unnamed: 0,DATE_m,TAVG,temp_celsius,Month,ref_temp,upper_outliner,lower_outliner,outliner
21,196010,20.290323,-6.505376,10,-0.862996,4.337056,-6.063048,True
33,196110,40.919355,4.955197,10,-0.862996,4.337056,-6.063048,True
38,196203,-1.596774,-18.664875,3,-9.546836,-2.211097,-16.882575,True
52,196305,49.370968,9.650538,5,3.889291,8.039371,-0.26079,True
85,196602,-11.946429,-24.414683,2,-14.130261,-5.51627,-22.744252,True


In [21]:
# The outlier helper columns are not needed for the anomaly calculation.
outlier_columns = ['upper_outliner', 'lower_outliner', 'outliner']
data = data.drop(columns=outlier_columns)
data.head()

Unnamed: 0,DATE_m,TAVG,temp_celsius,Month,ref_temp
0,195901,,,1,-14.708388
1,195902,,,2,-14.130261
2,195903,,,3,-9.546836
3,195904,,,4,-3.067089
4,195905,41.467742,5.259857,5,3.889291


In [22]:
# Temperature anomaly: monthly mean minus that calendar month's reference temperature.
data['Diff'] = data['temp_celsius'].sub(data['ref_temp'])

In [23]:
# Tag every row with a fixed station label; the original text station columns
# are no longer present after the monthly averaging.
data['STATION'] = 'SODANKYLA LOKKA FI'


In [24]:
# Preview the frame with the anomaly (Diff) and station label added.
data.head()

Unnamed: 0,DATE_m,TAVG,temp_celsius,Month,ref_temp,Diff,STATION
0,195901,,,1,-14.708388,,SODANKYLA LOKKA FI
1,195902,,,2,-14.130261,,SODANKYLA LOKKA FI
2,195903,,,3,-9.546836,,SODANKYLA LOKKA FI
3,195904,,,4,-3.067089,,SODANKYLA LOKKA FI
4,195905,41.467742,5.259857,5,3.889291,1.370566,SODANKYLA LOKKA FI


In [25]:
# Saved for reuse in the bonus task. to_csv keeps its default of writing the
# index, so the integer row index becomes an unnamed first column in the file.
data.to_csv('Sodankyla_data.csv')

# Calculate the monthly temperature differences between Sodankyla and Helsinki stations

In [26]:
# Inner-join the two stations on year-month and month: some year-months are
# missing for Sodankyla, and the default inner merge keeps only the rows that
# exist in both frames.
finaldataF = (
    data.merge(datahelsk, on=['DATE_m', 'Month'])
        .drop(columns=['STATION_x', 'STATION_y'])
)
# Positional rename: relies on the merged column order (Sodankyla columns
# first, then Helsinki).
# NOTE(review): 'TAVG_KELSINKI' looks like a typo for 'TAVG_HELSINKI'; it is
# kept as-is so that anything reading this column name keeps working.
finaldataF.columns = [
    'Date_month',
    'TAVG_SODANKYLA',
    'temp_celsius_SODANKYLA',
    'Month',
    'ref_temp_SODANKYLA',
    'DIFF_SODANKYLA',
    'TAVG_KELSINKI',
    'temp_celsius_HELSINKI',
    'ref_temp_HELSINKI',
    'DIFF_HELSINKI',
]
# Negative values mean Sodankyla was colder than Helsinki in that month.
sodankyla_celsius = finaldataF['temp_celsius_SODANKYLA']
helsinki_celsius = finaldataF['temp_celsius_HELSINKI']
finaldataF['Monthly_diff'] = sodankyla_celsius - helsinki_celsius

In [27]:
# Average every column over all years, giving one row per calendar month
# (Month becomes the index).
finaldataF = finaldataF.groupby('Month').mean()

In [28]:
# Drop the year-month column: its per-month average carries no meaning.
finaldataF = finaldataF.drop(columns=['Date_month'])

In [29]:
# Display the per-calendar-month comparison table.
finaldataF

Unnamed: 0_level_0,TAVG_SODANKYLA,temp_celsius_SODANKYLA,ref_temp_SODANKYLA,DIFF_SODANKYLA,TAVG_KELSINKI,temp_celsius_HELSINKI,ref_temp_HELSINKI,DIFF_HELSINKI,Monthly_diff
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,5.468622,-14.739655,-14.708388,-0.031266,22.023532,-5.542447,-5.351,-0.191596,-8.9579
2,6.599407,-14.111441,-14.130261,0.01882,21.63617,-5.757617,-5.941,0.183532,-7.691752
3,14.939785,-9.477897,-9.546836,0.068939,28.18934,-2.117021,-2.44,0.323298,-6.989069
4,26.344545,-3.141919,-3.067089,-0.07483,38.70917,3.727362,3.424,0.303468,-6.552562
5,38.854435,3.80802,3.889291,-0.081271,50.845426,10.469681,10.18,0.28983,-6.343318
6,50.715789,10.397661,10.392185,0.005476,58.491521,14.717521,14.668,0.049646,-4.320337
7,56.246718,13.470399,13.516478,-0.04608,63.230542,17.350292,17.281,0.069833,-3.784002
8,51.708263,10.949035,10.97593,-0.026895,60.310938,15.728333,15.604,0.124625,-4.756111
9,42.34386,5.746589,5.771552,-0.024963,51.279771,10.710979,10.596,0.114854,-4.769042
10,30.457555,-0.856914,-0.862996,0.006082,42.059667,5.588667,5.488,0.100958,-6.250629


In [30]:
# Save the per-month comparison table to CSV; the Month index is written as
# the first column (to_csv default).
finaldataF.to_csv('Month_relationship_helsk_lokka.csv')

# How different have the summer temperatures (June, July, August) been between the Helsinki and Sodankyla stations?
# What were the summer mean temperatures for both of these stations?
# What were the summer standard deviations for both of these stations?

In [31]:
# Summer statistics (June, July, August) for both stations.
# finaldataF is indexed by Month, so the summer rows are selected by label
# with .loc. The previous positional slice [5:8] only matched June-August
# when all twelve months were present, and was inconsistent with the
# label-based lookups used for the per-month differences. .loc raises
# KeyError if a summer month is missing instead of silently using other rows.
# Note: the std values are computed over the three monthly means.
summer_months = [6, 7, 8]
summer = finaldataF.loc[summer_months]
summer_mean_helsinki = summer['temp_celsius_HELSINKI'].mean()
summer_std_helsinki = summer['temp_celsius_HELSINKI'].std()
summer_mean_sodankyla = summer['temp_celsius_SODANKYLA'].mean()
summer_std_sodankyla = summer['temp_celsius_SODANKYLA'].std()
diff_jun = finaldataF.loc[6, 'Monthly_diff']
diff_jul = finaldataF.loc[7, 'Monthly_diff']
diff_aug = finaldataF.loc[8, 'Monthly_diff']

# A difference close to 0 means the two stations had similar temperatures;
# the further from 0, the less similar they were.
print('The difference in June have been: ', diff_jun)
print('The difference in July have been: ', diff_jul)
print('The difference in August have been: ', diff_aug)
print("The helsinki´s summer mean is: ",summer_mean_helsinki)
print("The helsinki´s summer std is: ",summer_std_helsinki)
print("The sodankyla´s summer mean is: ",summer_mean_sodankyla)
print("The sodankyla´s summer std is: ",summer_std_sodankyla)
print(finaldataF)

The difference in June have been:  -4.320337191358024
The difference in July have been:  -3.7840020908004774
The difference in August have been:  -4.756111111111111
The helsinki´s summer mean is:  15.932048611111112
The helsinki´s summer std is:  1.3281549235735783
The sodankyla´s summer mean is:  11.605698086314952
The sodankyla´s summer std is:  1.638241282879079
       TAVG_SODANKYLA  temp_celsius_SODANKYLA  ref_temp_SODANKYLA  \
Month                                                               
1            5.468622              -14.739655          -14.708388   
2            6.599407              -14.111441          -14.130261   
3           14.939785               -9.477897           -9.546836   
4           26.344545               -3.141919           -3.067089   
5           38.854435                3.808020            3.889291   
6           50.715789               10.397661           10.392185   
7           56.246718               13.470399           13.516478   
8          