In [53]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

Data analysis

In [54]:
data = pd.read_csv('countriesData.csv')
data.head()

Unnamed: 0,country,continent,year,lifeExpectancy,population,gdpPerCapita
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [55]:
data.drop(['continent'], axis=1)

data.corr()
#ВВП

Unnamed: 0,year,lifeExpectancy,population,gdpPerCapita
year,1.0,0.435611,0.082308,0.227318
lifeExpectancy,0.435611,1.0,0.064955,0.583706
population,0.082308,0.064955,1.0,-0.0256
gdpPerCapita,0.227318,0.583706,-0.0256,1.0


In [56]:
print(data.shape)
print('*' * 40)
data.info()

(1704, 6)
****************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
country           1704 non-null object
continent         1704 non-null object
year              1704 non-null int64
lifeExpectancy    1704 non-null float64
population        1704 non-null int64
gdpPerCapita      1704 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 66.6+ KB


In [57]:
def load_datasets(filename,data_columns,target_columns):
    
    df = pd.read_csv(filename, delimiter=',')
    X = df[data_columns]
    y = df[target_columns]
    return X,y

In [58]:
def split_datasets(X, y, test_size = 0.2):
 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    
    return X_train, X_test, y_train, y_test

In [59]:
def normalize(X):

    mean = np.mean(X)
    std = np.std(X)
    X_new = (X-mean)/std
    
    return X_new, mean, std

In [60]:
def prepare_X(X):

    m = X.shape[0]
    ones = np.ones((m, 1))
    X_new = np.array(X[:])
    X_new = np.column_stack((ones, X_new))
    
    return X_new

In [61]:
def hypothesis(X, theta):
    
    h_thetha = np.dot(X,theta)

    return h_thetha

In [62]:
def cost_function(X, y, theta):
  
    m = X.shape[0]
    if m == 0:
        return None
    
    J = 1 / (2 * m) * sum((hypothesis(X,theta) - y)**2)
 
    return J

In [63]:
def derivative(X, y, thetha):
    m = X.shape[0]
    
    d_thetha = np.dot(X.T, hypothesis(X, thetha) - y) / m

    return d_thetha

In [64]:
def gradient_descent(X, y, theta, alpha, num_iters, print_J = True):
    
    
    m = X.shape[0]
    J_history = []
    J = cost_function(X, y, theta)
    if print_J == True:
        print(J)
    J_history.append(J)
    for i in range(num_iters):

        delta = derivative(X,y,theta)/m
        theta = theta - alpha * delta
        J = cost_function(X,y,theta)

        if print_J == True:
            print(J)
        J_history.append(J)
    return theta, J_history

In [65]:
data_columns = ["year", "population", "gdpPerCapita"]
target_column = "lifeExpectancy"
X, y = load_datasets('countriesData.csv',data_columns,target_column)
print('Dataset: X={}, y={}'.format(X.shape, y.shape))

X["population"]=X["population"].div(1000000)

Dataset: X=(1704, 3), y=(1704,)


In [66]:
X_train, X_test, y_train, y_test = split_datasets(X, y, 0.1)
print('Training set: X={}, y={}'.format(X_train.shape, y_train.shape))
print('Test set: X={}, y={}'.format(X_test.shape, y_test.shape))

Training set: X=(1533, 3), y=(1533,)
Test set: X=(171, 3), y=(171,)


In [67]:
X_train_norm, mean, std = normalize(X_train)
X = prepare_X(X_train_norm)
theta = np.array([0, 0, 0, 0])

In [68]:
y = y_train[:]

In [69]:
print('X: {}'.format(X.shape))
print('y: {}'.format(y.shape))
print(cost_function(X, y, theta))

X: (1533, 4)
y: (1533,)
1840.350887924073


In [70]:
alpha = 50
num_iters = 500

In [71]:
new_theta, Js = gradient_descent(X, y, theta, alpha, num_iters, False)
print(new_theta)
print(cost_function(X, y, new_theta))

[59.27663532  4.02009541  0.78428834  6.60218   ]
47.45661138427909


In [72]:
X_test_proc = (X_test-mean)/std
X_test_proc = prepare_X(X_test_proc)
y_test_pred = hypothesis(X_test_proc, new_theta)
rel_diff = np.abs(np.divide(y_test - y_test_pred, y_test)) * 100
res = pd.DataFrame({'y_actual' : y_test, 'y_pred' : y_test_pred, 'err' : rel_diff})
print(res)
print(cost_function(X_test_proc, y_test, new_theta))

      y_actual     y_pred        err
596     77.030  68.920587  10.527604
663     70.000  55.526306  20.676706
1588    51.016  53.264186   4.406825
839     67.297  61.951563   7.943054
1549    61.800  51.782184  16.210058
175     65.205  62.250670   4.530833
1311    49.901  62.629501  25.507506
1389    75.130  69.776442   7.125726
1558    68.976  67.161618   2.630454
1173    61.818  60.718886   1.777984
1178    61.817  52.578919  14.944241
1369    67.450  53.123929  21.239543
1283    72.476  68.047392   6.110447
1169    54.043  55.063998   1.889232
1445    47.800  55.309687  15.710643
323     65.152  61.388826   5.775992
622     53.676  60.252640  12.252477
268     45.569  53.320979  17.011520
1093    70.260  57.188159  18.604955
314     44.467  51.161057  15.053989
284     74.126  62.357138  15.876834
347     55.322  63.163466  14.174227
556     38.308  53.066425  38.525700
74      69.540  57.398780  17.459333
675     69.500  57.644620  17.058100
1648    50.254  53.343105   6.146982
1