<p><img alt="Colaboratory logo" height="45px" src="/img/colab_favicon.ico" align="left" hspace="10px" vspace="0px"></p>

<h1>Что такое Colaboratory?</h1>

Colaboratory, или просто Colab, позволяет писать и выполнять код Python в браузере. При этом:
- не требуется никакой настройки;
- вы получаете бесплатный доступ к графическим процессорам;
- предоставлять доступ к документам другим людям очень просто.

Это отличное решение для <strong>студентов</strong>, <strong>специалистов по обработке данных</strong> и <strong>исследователей в области искусственного интеллекта</strong>. Чтобы узнать больше, посмотрите <a href="https://www.youtube.com/watch?v=inN8seMm7UI">ознакомительное видео</a> или начните работу с инструментом ниже.

In [None]:
import random
import math
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt


from google.colab import drive
# Mount Google Drive so files stored there are reachable from the notebook.
drive.mount('/content/drive')
# The file is read with ';' as the field separator.
data = pd.read_csv('/content/drive/MyDrive/toyota.csv', sep=';')
print(data.columns)

Mounted at /content/drive
Index(['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'tax',
       'mpg', 'engineSize'],
      dtype='object')


In [None]:
# (rows, columns) of the frame as loaded.
data.shape

(6738, 9)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,GT86,2016,16000,Manual,24089,Petrol,265,36.2,2.0
1,GT86,2017,15995,Manual,18615,Petrol,145,36.2,2.0
2,GT86,2015,13998,Manual,27469,Petrol,265,36.2,2.0
3,GT86,2017,18998,Manual,14736,Petrol,150,36.2,2.0
4,GT86,2017,17498,Manual,36284,Petrol,145,36.2,2.0


In [None]:
# List the distinct values found in every text (object-dtype) column.
text_columns = data.select_dtypes(include=['object'])
unique_values_data = pd.DataFrame(text_columns.apply(pd.unique))
unique_values_data

Unnamed: 0,0
model,"[ GT86, Corolla, RAV4, Yaris, Auris, Aygo..."
transmission,"[Manual, Automatic, Semi-Auto, Other]"
fuelType,"[Petrol, Other, Hybrid, Diesel]"


In [None]:
def _to_numeric_if_possible(column):
    """Return ``column`` converted to a numeric dtype, or unchanged if it cannot be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Drop rows with missing values, then convert every column that parses cleanly
# as numbers. ``pd.to_numeric(..., errors='ignore')`` is deprecated in recent
# pandas; the explicit try/except keeps the same "leave the column alone on
# failure" behaviour without relying on it.
dropedna_data = data.dropna().apply(_to_numeric_if_possible)

In [None]:
def _color_red_or_green(val):
    color = 'red' if abs(val) < 0.25 else 'green'
    return 'color: %s' % color

In [None]:
# Correlation matrix of the numeric columns, with weak correlations
# (|r| < 0.25) shown in red and the rest in green.
# The text columns are excluded explicitly: since pandas 2.0 ``DataFrame.corr``
# no longer drops them silently and raises on non-numeric data instead.
dropedna_data.select_dtypes(include='number').corr().style.applymap(_color_red_or_green)

Unnamed: 0,year,price,mileage,tax,mpg,engineSize
year,1.0,0.420959,-0.727749,0.283455,-0.020607,-0.045567
price,0.420959,1.0,-0.297936,0.214498,-0.03991,0.727592
mileage,-0.727749,-0.297936,1.0,-0.205285,0.044889,0.182615
tax,0.283455,0.214498,-0.205285,1.0,-0.430292,0.155727
mpg,-0.020607,-0.03991,0.044889,-0.430292,1.0,-0.106868
engineSize,-0.045567,0.727592,0.182615,0.155727,-0.106868,1.0


In [None]:
# Remove the 'tax' and 'mpg' columns before modelling.
# NOTE(review): presumably because their correlation with price falls under the
# 0.25 threshold used in the styled table - confirm.
data = data.drop(columns=['tax', 'mpg'])
data.shape

(6738, 7)

In [None]:
# Move the text columns into their own frame; `data` keeps the remaining ones.
category_data = data.select_dtypes(include=['object']).copy()
data = data.drop(columns=category_data.columns.tolist())
category_data.head()

Unnamed: 0,model,transmission,fuelType
0,GT86,Manual,Petrol
1,GT86,Manual,Petrol
2,GT86,Manual,Petrol
3,GT86,Manual,Petrol
4,GT86,Manual,Petrol


In [None]:
# One-hot encode the categorical columns.
# The dtype is pinned to uint8 (the historical default): pandas >= 2.0 emits
# bool dummies, and a frame mixing bool and float columns converts to an
# object-dtype NumPy array, which breaks the in-place weight updates later on.
category_data = pd.get_dummies(
    category_data,
    columns=['model', 'transmission', 'fuelType'],
    dtype=np.uint8,
)
category_data.head()

Unnamed: 0,model_ Auris,model_ Avensis,model_ Aygo,model_ C-HR,model_ Camry,model_ Corolla,model_ GT86,model_ Hilux,model_ IQ,model_ Land Cruiser,...,model_ Verso-S,model_ Yaris,transmission_Automatic,transmission_Manual,transmission_Other,transmission_Semi-Auto,fuelType_Diesel,fuelType_Hybrid,fuelType_Other,fuelType_Petrol
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [None]:
# (rows, columns) of the one-hot encoded frame.
category_data.shape

(6738, 26)

In [None]:
# Join the one-hot columns and the remaining columns side by side
# (one-hot columns first).
data = pd.concat([category_data, data], axis='columns')
data.shape

(6738, 30)

In [None]:
# Rescale the numeric columns by a fixed divisor each.
# column -> (threshold, divisor): only values above the threshold are divided,
# so re-running the cell does not shrink already-scaled values a second time.
_SCALING = {
    'year': (3, 10000),
    'mileage': (3, 100000),
    'engineSize': (0.99, 10),
    'price': (500, 10000),
}

# Cast to float up front: writing fractional values into an integer column
# through .loc relies on implicit upcasting, which recent pandas deprecates.
data = data.astype({column: float for column in _SCALING})

for column, (threshold, divisor) in _SCALING.items():
    needs_scaling = data[column] > threshold
    data.loc[needs_scaling, column] = data[column] / divisor

data.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6728,6729,6730,6731,6732,6733,6734,6735,6736,6737
model_ Auris,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
model_ Avensis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
model_ Aygo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
model_ C-HR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
model_ Camry,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
model_ Corolla,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
model_ GT86,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
model_ Hilux,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
model_ IQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
model_ Land Cruiser,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Target: the price column. Features: every other column.
y = data['price']
X = data.drop(columns=['price'])

In [None]:
def initialize_with_zeros(X):
    """Create the starting parameters for the linear model.

    Despite the name, only the bias starts at zero; the weights are small
    random values drawn uniformly from [-0.001, 0.001).

    The draw now comes from NumPy's generator, which is the one seeded here.
    Previously the weights came from the standard-library ``random`` module,
    which ``np.random.seed`` does not affect, so every call produced different
    weights.

    Parameters
    ----------
    X : array-like with a ``shape`` attribute
        Only ``X.shape[1]`` (the number of features) is read.

    Returns
    -------
    w : numpy.ndarray of shape (X.shape[1],)
        Initial weights.
    b : int
        Initial bias, always 0.
    """
    np.random.seed(0)  # fixed seed -> identical initial weights on every call
    dim = X.shape[1]
    w = np.random.uniform(-0.001, 0.001, size=dim)
    b = 0
    return w, b

In [None]:
# Scratch check on toy data: element-wise product of one row with the weights,
# and the sum of (row * weights + b).
w1 = np.arange(1, 7)
b = np.ones(6, dtype=int)
x1 = np.tile(w1, (2, 1))
first_row = x1[0]
print(np.sum(first_row*w1+b))
print(first_row*w1)

97
[ 1  4  9 16 25 36]


In [None]:
def optimize(w, b, X, Y, num_iterations, learning_rate):
    """Fit a linear model ``sum(w * x) + b`` with per-sample gradient descent.

    Every iteration walks once over all rows of ``X`` in order and updates the
    parameters after each row, using the gradient of that row's squared error.

    Parameters
    ----------
    w : array-like of shape (n_features,)
        Initial weights. Not modified; a float copy is optimised instead.
    b : scalar
        Initial bias.
    X : array-like of shape (n_samples, n_features)
        Feature rows.
    Y : array-like of shape (n_samples,)
        Target values, positionally aligned with ``X``.
    num_iterations : int
        Number of full passes over ``X``.
    learning_rate : float
        Step size of each update.

    Returns
    -------
    params : dict
        ``{"w": final weights, "b": final bias}``.
    grads : dict
        ``{"dw": ..., "db": ...}`` - gradients of the last processed sample
        (zeros if no sample was processed).
    costs : list
        Squared error of the last sample of the pass, recorded every 10th
        iteration (0, 10, 20, ...). Note this is a single-sample error, not an
        average over the pass.
    """
    # Float copies: the caller's array stays untouched and integer input does
    # not break the in-place updates below. Converting X/Y to plain arrays also
    # makes Y[j] positional even when a pandas Series is passed in.
    w = np.array(w, dtype=float)
    b = np.float64(b)
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    n_samples = len(X)  # iterate over the data passed in, not a notebook global

    costs = []
    cost = None
    # Defined up front so the return value is well-formed even when the loops
    # never run (zero iterations or empty X).
    dw = np.zeros_like(w)
    db = np.float64(0.0)

    for i in range(num_iterations):
        last_iteration = i == num_iterations - 1
        for j in range(n_samples):
            # Forward pass and squared-error cost for sample j
            A = np.sum(w * X[j]) + b
            error = A - Y[j]
            cost = error ** 2

            # Gradients of the squared error with respect to w and b
            dw = 2 * X[j] * error
            db = 2 * error

            # Diagnostics: parameter range on the first 10 samples of the
            # final iteration
            if last_iteration and j < 10:
                print('Min= ', min(w))
                print('Max= ', max(w))
                print('B= ', b)

            # Update the parameters
            w -= learning_rate * dw
            b -= learning_rate * db

        if i % 10 == 0 and cost is not None:
            costs.append(cost)
            print("Cost after iteration %i: %f" % (i, cost))

    params = {"w": w,
              "b": b}

    grads = {"dw": dw,
             "db": db}

    return params, grads, costs

In [None]:
# Positional 70/30 train/test split: the first k rows train, the rest test.
# NOTE(review): rows are taken in file order without shuffling; if the file is
# grouped (e.g. by model) the two sets follow different distributions -
# consider shuffling with a fixed seed before splitting.
k = int(X.shape[0] * 0.7)
X_train = np.array(X[:k], dtype=float)
Y_train = np.array(y[:k], dtype=float)
# The test slice starts at k (not k + 1) so that row k is not silently dropped
# from both sets.
X_test = np.array(X[k:], dtype=float)
Y_test = np.array(y[k:], dtype=float)
# Number of features per sample
print(X_train.shape[1])

29


In [None]:
# Initialise the parameters and run 150 passes of gradient descent over the
# training set.
w, b = initialize_with_zeros(X_train)
params, grads, costs = optimize(
    w,
    b,
    X_train,
    Y_train,
    num_iterations=150,
    learning_rate=0.0005,
)

Cost after iteration 0: 0.138940
Cost after iteration 10: 0.045114
Cost after iteration 20: 0.019585
Cost after iteration 30: 0.012626
Cost after iteration 40: 0.010608
Cost after iteration 50: 0.009964
Cost after iteration 60: 0.009736
Cost after iteration 70: 0.009642
Cost after iteration 80: 0.009593
Cost after iteration 90: 0.009562
Cost after iteration 100: 0.009540
Cost after iteration 110: 0.009524
Cost after iteration 120: 0.009513
Cost after iteration 130: 0.009504
Cost after iteration 140: 0.009498
Min=  -1.1373907816785815
Max=  0.6979825548707643
B=  0.9115971083863477
Min=  -1.1374653684799303
Max=  0.6979206288453181
B=  0.9112874782591165
Min=  -1.1375344549324973
Max=  0.6978464021973817
B=  0.9109163450194341
Min=  -1.1376631607083911
Max=  0.6977526923601878
B=  0.9104477958334646
Min=  -1.1376795913359918
Max=  0.6977303923770164
B=  0.9103362959176078
Min=  -1.1376853587868252
Max=  0.6977272133169165
B=  0.9103204006171082
Min=  -1.137758678761099
Max=  0.697672738

In [None]:
# Predict on the training set with the fitted parameters.
pw = params["w"]
pb = params["b"]
print(X_train.shape)
# Vectorised form of sum(pw * row) + pb for every training row.
Y_pred = np.sum(pw * X_train, axis=1) + pb
print(pw.shape)
print(np.shape(pb))
# Side-by-side preview of predicted vs. actual price; the factor 10000 undoes
# the price scaling applied earlier. The preview length follows the number of
# weights (as before) but is capped at the number of predictions so it cannot
# index past the data.
n_preview = min(len(pw), len(Y_pred))
for i in range(n_preview):
    print(Y_pred[i]*10000, '       ', Y_train[i]*10000)

(4716, 29)
(29,)
()
19096.35331926177         16000.0
19719.14468205197         15995.0
18711.726378771014         13998.0
20160.328919921867         18998.0
17709.532849382933         17498.0
18774.676027431808         15998.000000000002
20647.12153092552         18522.0
20432.84179281083         18995.0
21778.25503412658         27998.0
17514.277333499907         13990.0
13646.527705682296         10495.000000000002
20403.611488832335         17990.0
17730.460382192436         16995.0
21723.577945159566         23995.000000000004
17829.83595156236         18498.0
21637.593082094434         23980.0
19966.06682188583         17995.0
18935.590202034822         12998.0
22151.366520708285         23495.0
21254.07329604203         25780.0
22492.934906800932         26995.0
22559.5010242194         23998.0
21531.931788724713         26995.0
20204.79992714206         17000.0
20070.788011049484         19995.0
21495.733341198298         31000.0
20449.436679217306         15995.0
18284.4411072

In [None]:
# Mean absolute percentage error on the training set, as a fraction.
train_relative_error = np.abs((Y_train - Y_pred) / Y_train)
print('MAPE Train= ', train_relative_error.mean())

MAPE Train=  0.13133898649768494


In [None]:
# Check the model on the held-out test data.
# Vectorised form of sum(pw * row) + pb for every test row.
Y_pred_test = np.sum(pw * X_test, axis=1) + pb
# Preview predicted vs. actual price (x10000 undoes the price scaling). The
# preview length follows the number of weights (as before) but is capped at the
# number of predictions so it cannot index past the data.
n_preview_test = min(len(pw), len(Y_pred_test))
for i in range(n_preview_test):
    print(Y_pred_test[i]*10000, '       ', Y_test[i]*10000)

8368.168926634362         8495.0
6837.740009279616         7995.0
9605.178750153636         9995.0
8359.183735917239         9295.0
8215.431340374758         8295.0
7454.222545904312         7695.0
9705.79476609698         8495.0
9509.041118982608         8995.0
7436.593374244134         7695.0
6478.703801843962         7495.000000000001
5244.914514272982         6899.999999999999
7332.099658263189         6720.0
7646.353860179797         7195.0
8826.962780620719         9195.0
5655.933299974707         5995.0
9148.041178018686         9000.0
8040.792359002381         8300.0
8229.906098952184         8295.0
5978.831482834373         8495.0
7326.268880628819         6490.0
9241.760129296024         8480.0
8249.779790964269         7995.0
7785.112501634109         6995.0
5681.296559847095         7500.0
9381.903165390326         9495.0
5189.613275415137         7295.0
7838.538488096212         7995.0
9637.108520849455         11900.0
8823.49736923911         7450.0


In [None]:
# Mean absolute percentage error on the test set, as a fraction.
test_relative_error = np.abs((Y_test - Y_pred_test) / Y_test)
print('MAPE Test= ', test_relative_error.mean())

MAPE Test=  0.2406416813511642
