In [4]:
#Numpy exercises
from pathlib import Path

import numpy as np

# Two consecutive runs of five integers used by the NumPy exercises below.
x = np.arange(1, 6)
y = np.arange(6, 11)

#Ex.1: inner product
def inner(x, y):
    """Return the inner product of ``x`` and ``y``.

    The operands are multiplied element-wise and all the resulting
    products are added together.
    """
    elementwise_products = x * y
    return np.sum(elementwise_products)

inner_prod = inner(x, y)

# Sanity check: the hand-written version must agree with NumPy's own inner product.
inner_prod == np.inner(x, y)

True

In [17]:
# Ex.2: MAE
# Mean absolute error: the average of the absolute element-wise differences.
# The absolute value must be taken *before* aggregating; otherwise positive
# and negative errors cancel each other out.
mae = np.mean(np.abs(y - x))
mae

5.0

In [53]:
#Ex.3: lead and lag
def lead(x, n):
    """Shift ``x`` ``n`` positions towards the front, padding the end with NaN.

    Parameters
    ----------
    x : np.ndarray
        One-dimensional numeric array.
    n : int
        Number of positions to shift; must be non-negative. Values larger
        than ``len(x)`` are treated as ``len(x)`` (the result is all NaN).

    Returns
    -------
    np.ndarray
        Array of the same length as ``x``: the first ``n`` elements are
        dropped and ``n`` NaNs are appended. NaN padding makes the result a
        floating-point array.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError('n must be non-negative')
    # Never pad with more NaNs than there are elements, so the length is preserved.
    n = min(n, len(x))
    padding = np.full(n, np.nan)
    return np.concatenate([x[n:], padding])

def lag(x, n):
    """Shift ``x`` ``n`` positions towards the back, padding the front with NaN.

    Parameters
    ----------
    x : np.ndarray
        One-dimensional numeric array.
    n : int
        Number of positions to shift; must be non-negative. Values larger
        than ``len(x)`` are treated as ``len(x)`` (the result is all NaN).

    Returns
    -------
    np.ndarray
        Array of the same length as ``x``: ``n`` NaNs are prepended and the
        last ``n`` elements are dropped. NaN padding makes the result a
        floating-point array.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError('n must be non-negative')
    # Never pad with more NaNs than there are elements, so the length is preserved.
    n = min(n, len(x))
    padding = np.full(n, np.nan)
    # An explicit end position is required: x[:-n] would be empty for n == 0.
    return np.concatenate([padding, x[:len(x) - n]])

# Show the original array next to its two-position lead and lag.
shift = 2
print('x:', x)
print('lead(x,2):', lead(x, shift))
print('lag(x,2):', lag(x, shift))

x: [1 2 3 4 5]
lead(x,2): [  3.   4.   5.  nan  nan]
lag(x,2): [ nan  nan   1.   2.   3.]


In [4]:
def norm(x, y):
    """Euclidean distance between every row of ``x`` and the vector ``y``.

    Parameters
    ----------
    x : np.ndarray
        Two-dimensional array holding one point per row.
    y : np.ndarray
        Array that broadcasts against the rows of ``x``.

    Returns
    -------
    np.ndarray
        Column vector of shape ``(x.shape[0], 1)`` with one distance per row.
    """
    distances = np.sqrt(np.sum((x - y)**2, axis=1))
    # Size the column from the input instead of assuming exactly three rows.
    return distances.reshape(x.shape[0], -1)
    
# Three 5-element rows compared against a single 5-element vector.
# NOTE: this rebinds the names x and y used by the earlier exercises.
x = np.array([[1,2,3,4,5], [1,2,3,4,5], [3,4,5,1,4]])
y = np.array([1,5,2,7,1])
display(norm(x,y))

#check
# NOTE(review): np.linalg.norm(..., axis=1) has shape (3,) while norm(x,y) is
# reshaped to (3, 1), so `==` broadcasts to a 3x3 matrix in which only the
# diagonal compares matching rows. Flattening one side (e.g. with .ravel())
# would turn this into a plain element-wise check.
np.linalg.norm(x-y, axis=1) == norm(x,y)

array([[ 5.91607978],
       [ 5.91607978],
       [ 7.68114575]])

array([[ True,  True, False],
       [ True,  True, False],
       [False, False,  True]], dtype=bool)

In [90]:
#Pandas exercises
import pandas as pd

# Build the location from the current user's home directory instead of
# hard-coding one machine's absolute path.
weather_path = Path.home() / 'Downloads' / 'nycflights13_weather.csv'
# presumably the file starts with 42 lines of descriptive header text — verify against the CSV
df = pd.read_csv(weather_path, skiprows=42)
df.head()

Unnamed: 0,origin,year,month,day,hour,temp,dewp,humid,wind_dir,wind_speed,wind_gust,precip,pressure,visib,time_hour
0,EWR,2013,1,1,0,37.04,21.92,53.97,230.0,10.35702,11.918651,0.0,1013.9,10.0,2013-01-01 01:00:00
1,EWR,2013,1,1,1,37.04,21.92,53.97,230.0,13.80936,15.891535,0.0,1013.0,10.0,2013-01-01 02:00:00
2,EWR,2013,1,1,2,37.94,21.92,52.09,230.0,12.65858,14.567241,0.0,1012.6,10.0,2013-01-01 03:00:00
3,EWR,2013,1,1,3,37.94,23.0,54.51,230.0,13.80936,15.891535,0.0,1012.7,10.0,2013-01-01 04:00:00
4,EWR,2013,1,1,4,37.94,24.08,57.04,240.0,14.96014,17.21583,0.0,1012.8,10.0,2013-01-01 05:00:00


In [91]:
#Ex.1
# Fahrenheit -> Celsius: subtract 32, then scale by 5/9.
# The column is overwritten in place, so running this cell twice converts twice.
df['temp'] = df['temp'].sub(32).mul(5).div(9)

In [93]:
#daily mean temperatures
def change_na(x):
    '''Fill the NaNs of a numeric Series by linear interpolation.

    Interior gaps are filled linearly between the nearest valid neighbours,
    treating the observations as equally spaced. NaNs at the beginning or at
    the end of the series are set equal to the nearest non-NaN value (no
    extrapolation).

    Parameters
    ----------
    x : pandas.Series
        Numeric series, possibly containing NaNs. Any index is accepted;
        only the order of the values matters.

    Returns
    -------
    pandas.Series
        ``x`` itself when it contains no NaN or nothing but NaNs (in the
        latter case there is no value to fill from); otherwise a new float
        Series with the same index and name and every NaN filled. The input
        is not modified.
    '''
    missing = np.asarray(x.isna())
    if missing.all() or not missing.any():
        return x

    values = np.asarray(x, dtype=float)
    positions = np.arange(len(values))
    # np.interp interpolates linearly between the known points and holds the
    # first/last known value outside their range, which is exactly the edge
    # rule described in the docstring.
    filled = np.interp(positions, positions[~missing], values[~missing])
    return pd.Series(filled, index=x.index, name=x.name)

# To remove the NaNs from every column instead, one could use:
#     df.apply(change_na, axis=0)
df['temp'] = change_na(df['temp'])
# numeric_only=True restricts the average to numeric columns; without it
# recent pandas versions raise on the string columns instead of dropping them.
df = df.groupby(['month', 'day']).mean(numeric_only=True)
df['temp'].head()

month  day
1      1      3.813043
       2     -1.833333
       3     -1.308333
       4      1.254167
       5      2.845833
Name: temp, dtype: float64

In [34]:
#days with greater temperature than preceding day
# shift(1) pairs every row with the value of the row just before it; the first
# row has no predecessor (NaN) and therefore never passes the comparison.
previous_day_temp = df['temp'].shift(1)
df.loc[df['temp'] > previous_day_temp].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,year,hour,temp,dewp,humid,wind_dir,wind_speed,wind_gust,precip,pressure,visib
month,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,3,2013.0,11.5,-1.308333,14.505,53.269722,291.805556,10.053342,11.569185,0.0,1021.0125,10.0
1,4,2013.0,11.5,1.254167,18.955,53.470972,253.75,15.008089,17.271009,0.0,1017.5,10.0
1,5,2013.0,11.5,2.845833,19.11,48.178194,272.361111,12.003275,13.813128,0.0,1021.05,10.0
1,6,2013.0,11.5,3.541667,26.555,63.858889,234.722222,8.726748,10.042567,0.0,1019.860417,9.048611
1,7,2013.0,11.5,5.425,24.6325,51.489167,277.5,9.957444,11.458827,0.0,1022.9125,10.0


In [35]:
#five hottest days
df_five_hot = df.sort_values('temp', ascending=True).tail(5)
print('Five hottest days:')
# The rows are sorted ascending, so walk them in reverse to rank the hottest first.
for i, ((month, day), row) in enumerate(df_five_hot.iloc[::-1].iterrows(), start=1):
    # Look the values up by column label: positional [] access on a labelled
    # Series is deprecated and silently breaks if the column order changes.
    year = int(row['year'])
    temp = row['temp']
    print('{}) {}/{}/{}: {:.2f} Celsius degrees'.format(i, day, month, year, temp))
    

Five hottest days:
1) 19/7/2013: 32.23 Celsius degrees
2) 18/7/2013: 31.40 Celsius degrees
3) 20/7/2013: 30.91 Celsius degrees
4) 17/7/2013: 30.72 Celsius degrees
5) 16/7/2013: 30.55 Celsius degrees


In [37]:
#Ex.2
# Build the location from the current user's home directory instead of
# hard-coding one machine's absolute path.
flights_path = Path.home() / 'Downloads' / 'nycflights13_flights.csv'
# presumably the file starts with 54 lines of descriptive header text — verify against the CSV
df = pd.read_csv(flights_path, skiprows=54)
# Author's note: df.describe() shows that the data contains some NaN.

#select all columns between year and day
column_names = list(df.columns)
year_index = column_names.index('year')
day_index = column_names.index('day')
# Order the two positions so the slice works whichever column comes first.
first_index, last_index = sorted((year_index, day_index))
df_inclusive = df.iloc[:, first_index:last_index + 1]

df_inclusive.head()

Unnamed: 0,year,month,day
0,2013,1,1
1,2013,1,1
2,2013,1,1
3,2013,1,1
4,2013,1,1


In [38]:
#select all columns but those between year and day
# Keep everything to the left of the first boundary column and everything to
# the right of the last one, whichever of year/day comes first.
low, high = min(year_index, day_index), max(year_index, day_index)
col_indexes = list(range(low)) + list(range(high + 1, df.shape[1]))
df_exclusive = df.iloc[:, col_indexes]

df_exclusive.head()

Unnamed: 0,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00
1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00
2,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00
3,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00
4,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00


In [89]:
#Ex.3
# The three files live in the same folder; build it once from the current
# user's home directory instead of repeating a machine-specific absolute path.
BIRTH_DATES_DIR = (Path.home() / 'Desktop' / 'Bocconi' / 'DSBA' / 'Statistics'
                   / 'ProbabilityAndStatistics-master-2' / 'Assignment_1' / 'data')
A = pd.read_csv(BIRTH_DATES_DIR / 'some_birth_dates1.csv')
B = pd.read_csv(BIRTH_DATES_DIR / 'some_birth_dates2.csv')
C = pd.read_csv(BIRTH_DATES_DIR / 'some_birth_dates3.csv')
# Unions: stack the tables and keep a single copy of every repeated row.
union_ab = pd.concat([A, B]).drop_duplicates()
union_abc = pd.concat([A, B, C]).drop_duplicates()
# Intersections: an inner merge keeps only the rows that match in both tables.
intersection_ab = A.merge(B, how='inner')
intersection_ac = A.merge(C, how='inner')

# Show every result under its heading, followed by an empty line.
for title, frame in [
    ('A or B', union_ab),
    ('A or B or C', union_abc),
    ('A and B', intersection_ab),
    ('A and C', intersection_ac),
]:
    print(title)
    display(frame)
    print()
print('A minus B')
# Keep the rows of A whose Name does not occur in B. A boolean mask works for
# any index, whereas looping over range(len(A)) and dropping by label only
# works when A carries the default 0..n-1 index.
a_minus_b = A.loc[~A['Name'].isin(B['Name'])]
display(a_minus_b)

A or B


Unnamed: 0,Name,BirthDate
0,Paitoon Ornwimol,26.06.1958
1,Antónia Lata,20.05.1935
2,Bertoldo Mallozzi,17.08.1972
3,Nedeljko Bukv,19.12.1921
4,Micha Kitchen,17.09.1930
5,Mefodiy Shachar,01.10.1914
6,Paul Meckler,29.09.1968
7,Katarzyna Lasko,20.10.1971
8,Åge Trelstad,07.03.1935
9,Duchanee Panomyaong,19.06.1952



A or B or C


Unnamed: 0,Name,BirthDate
0,Paitoon Ornwimol,26.06.1958
1,Antónia Lata,20.05.1935
2,Bertoldo Mallozzi,17.08.1972
3,Nedeljko Bukv,19.12.1921
4,Micha Kitchen,17.09.1930
5,Mefodiy Shachar,01.10.1914
6,Paul Meckler,29.09.1968
7,Katarzyna Lasko,20.10.1971
8,Åge Trelstad,07.03.1935
9,Duchanee Panomyaong,19.06.1952



A and B


Unnamed: 0,Name,BirthDate
0,Micha Kitchen,17.09.1930
1,Mefodiy Shachar,01.10.1914
2,Paul Meckler,29.09.1968
3,Katarzyna Lasko,20.10.1971
4,Åge Trelstad,07.03.1935
5,Duchanee Panomyaong,19.06.1952



A and C


Unnamed: 0,Name,BirthDate



A minus B


Unnamed: 0,Name,BirthDate
0,Paitoon Ornwimol,26.06.1958
1,Antónia Lata,20.05.1935
2,Bertoldo Mallozzi,17.08.1972
3,Nedeljko Bukv,19.12.1921
