# FINAL PROJECT - OCCASION VEHICLES PRICE ESTIMATOR
___
#### MASTER IN DATA SCIENCE - KSCHOOL - 2016/17
#### KOLDO PINA ORTIZ
____

## Motivation

The aim of this work is to build a good estimator of the price of a second-hand vehicle, based on the prices observed in the second-hand market.

## Action Plan

To reach our goal, we will follow these steps:

1. ***Scrape*** the website **motos.net** to obtain the ***data***.
2. ***Clean*** the ***data***.
3. ***Merge*** some ***data***.
4. ***Train*** various models.
5. Compare the metrics and choose the model with the best one.
6. Create a Flask web server with the app.

## Scrape

We are going to scrape each website separately.
To this end, we have developed two Python scripts called ***scraper_motos.py*** and ***scrapers_coches.py***.
Both return a dataframe.

You can import them to the notebook.

In [None]:
%pwd

In [None]:
%cd scrapers

In [None]:
#Call the scraper_motos function to scrape motos.net and create a csv file with the raw data in scraped_data folder
#To do that we need to change our path
# %cd scrapers
# from scraper_motos import scraper_motos
# scraper_motos()

In [None]:
#Start time: 2017-06-13 19:28:56.441707
#num_ads 26588
#End time: 2017-06-13 22:47:15.427469

In [None]:
%cd ..
%pwd

In [None]:
import pandas as pd

# Read the raw scraped ads from csv and show the (rows, columns) of the table
raw_motos_csv = 'scraped_data/test/motos_raw_data.csv'
df_motos_raw = pd.read_csv(raw_motos_csv)
df_motos_raw.shape

In [None]:
# Preview the first rows of the raw scraped data
df_motos_raw.head()

## Data cleaning

In [None]:
# Ads without a colour get an explicit placeholder instead of NaN.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column is chained assignment, which is not guaranteed to update
# the dataframe and does nothing under pandas copy-on-write.
df_motos_raw['color'] = df_motos_raw['color'].fillna('not specified')

In [None]:
# Turn every column into lower-case text
def _to_lower_text(column):
    """Return the column cast to str with all characters lower-cased."""
    return column.astype(str).str.lower()


df_motos_raw = df_motos_raw.apply(_to_lower_text)

In [None]:
# Replace the blanks inside the 'model' and 'type' values with underscores
for text_column in ('model', 'type'):
    df_motos_raw[text_column] = df_motos_raw[text_column].str.replace(' ', '_')

In [None]:
# Looking for duplicates: flag every row that repeats an earlier one and count them
df_motos_raw['is_duplicated'] = df_motos_raw.duplicated()
duplicates = df_motos_raw['is_duplicated'].sum()
# Single-argument print(...) produces the same output on Python 2 and Python 3
print('%d duplicates' % duplicates)

In [None]:
# Keep only the rows that are not flagged as duplicates, then delete the
# helper 'is_duplicated' column
df_motos_raw = df_motos_raw.loc[~df_motos_raw['is_duplicated']]
# axis is passed by keyword: the positional form drop(label, 1) was removed
# in pandas 2.0
df_motos_raw = df_motos_raw.drop('is_duplicated', axis=1)
df_motos_raw.shape

In [None]:
# Let's investigate, column by column, the NaNs we have in the dataframe.
# The comparison is against the string 'nan'; for each column we print the
# number of matches and the percentage of rows they represent.
total_rows = df_motos_raw.shape[0]
for column in df_motos_raw.columns:
    n_nan = df_motos_raw[column] == 'nan'
    nan_count = n_nan.sum()
    print(column + " %d -- > %f" % (nan_count, (nan_count * 1.0) / total_rows * 100))

In [None]:
# Let's investigate how many unique values we have in the categorical columns
for column in ['city', 'brand', 'model', 'type', 'color', 'year']:
    column_uv = df_motos_raw[column].unique()
    # Only the count is needed, so the values are not sorted first
    print(column + " --> " + "%d unique values" % len(column_uv))

## Merge some data

In [None]:
#In order to calculate our first metric, we will use the following columns:
# "lon" and "lat" : These are the longitude and latitude of the corresponding city. We will add them later.
# "brand", "model", "type", "year"

In [None]:
# Calculating the longitude and latitude of the cities
import geopy
from geopy.geocoders import Nominatim

# Nominatim's usage policy asks every client to identify itself; geopy 2.x
# raises ConfigurationError when the default user agent is used.
geolocator = Nominatim(user_agent='occasion_vehicles_price_estimator')

In [None]:
# Geocode every distinct city once and keep its coordinates
cities = df_motos_raw['city'].unique()
locations_rows = []
cities_not_found = []
for city in cities:
    # geocode() takes the query as a string and returns None when the
    # service finds no match
    location = geolocator.geocode(city, timeout=15)
    if location is None:
        cities_not_found.append(city)
        continue
    locations_rows.append([city, location.latitude, location.longitude])

# Cities without a match get no row in df_locations; report them so they can
# be reviewed instead of stopping the whole run
if cities_not_found:
    print('%d cities could not be geocoded: %s'
          % (len(cities_not_found), ', '.join(str(c) for c in cities_not_found)))

# Save into a csv
df_locations = pd.DataFrame(locations_rows, columns=['city', 'lat', 'lon'])
df_locations.to_csv('auxiliary_data/locations_coords.csv', index=False)

In [None]:
# Show the first three city/coordinate rows
df_locations.head(3)

In [None]:
# Attach the city coordinates to every ad, matching rows on the 'city' column
df_motos_raw_coord = df_motos_raw.merge(df_locations, on='city')
# Persist the merged table as csv
df_motos_raw_coord.to_csv('df_motos_raw_coord.csv', index=False)

In [None]:
# We have created two csv files with a rank for the motorbike brands and types,
# called rank_moto_brands.csv and rank_moto_types.csv

# With the first one, rank_moto_brands.csv, we are going to create another column with the score of the corresponding brand
df_rank_moto_brand = pd.read_csv('rank_moto_brands.csv', sep=';')
df_motos_raw_coord_brand = pd.merge(df_motos_raw_coord, df_rank_moto_brand, on='brand', how='left')
# If the brand does not exist in the ranking, its score will be zero
# WARNING: add the motorbike brands that are missing from the ranking before relying on this zero
# The filled column is assigned back: fillna(inplace=True) on a selected
# column is chained assignment and does nothing under pandas copy-on-write.
df_motos_raw_coord_brand['brand_score'] = df_motos_raw_coord_brand['brand_score'].fillna(0)
# Save into a csv
df_motos_raw_coord_brand.to_csv('df_motos_coord_brand.csv', index=False)

# With the second one, rank_moto_types.csv, we are going to create another column with the score of the corresponding type
df_rank_moto_type = pd.read_csv('rank_moto_types.csv', sep=';')
df_motos_raw_coord_brand_types = pd.merge(df_motos_raw_coord_brand, df_rank_moto_type, on='type', how='left')
# Save into a csv
df_motos_raw_coord_brand_types.to_csv('df_motos_raw_coord_brand_type.csv', index=False)


In [None]:
# Show the first three rows of the table with coordinates and both scores
df_motos_raw_coord_brand_types.head(3)

In [None]:
# Dimensions (rows, columns) of the merged table
df_motos_raw_coord_brand_types.shape

In [None]:
# OK!, so we have a first version of the data we will use to recommend vehicles
# df_motos_raw_coord_brand_types
# Lets try to calculate the metric only with some fields. We are going to add these distances:
# cities distance, brand_distance, type_distance, year_distance
# We need to create some functions:

def cities_distance(city_lat, city_lon, user_lat, user_lon):
    """Return the distance in kilometers between an ad's city and the user's city.

    :param city_lat: the value in the dataset's lat column for the corresponding city
    :param city_lon: the value in the dataset's lon column for the corresponding city
    :param user_lat: the lat value, in the location dataset, of the city selected by the user
    :param user_lon: the lon value, in the location dataset, of the city selected by the user

    :return: The value in kilometers of the distance between the two cities.

    The Vincenty distance is used when geopy provides it. geopy 2.0 removed
    ``vincenty``, so on newer versions the function falls back to
    ``geodesic``, its documented replacement.
    """
    try:
        from geopy.distance import vincenty as ellipsoid_distance
    except ImportError:
        # geopy >= 2.0: vincenty no longer exists
        from geopy.distance import geodesic as ellipsoid_distance

    column_city = (city_lat, city_lon)
    user_city = (user_lat, user_lon)

    return ellipsoid_distance(column_city, user_city).km

def distance_abs_value(a_value, b_value):
    """Return the absolute value of the difference ``a_value - b_value``."""
    difference = a_value - b_value
    return abs(difference)

def w_s(city_row, brand_row, type_row, year_row):
    """Return the weighted average of the four partial metrics of one ad.

    :param city_row: the city metric of the ad
    :param brand_row: the brand metric of the ad
    :param type_row: the type metric of the ad
    :param year_row: the year metric of the ad

    :return: sum(metric * weight) / sum(weights), with weights
             city=10, brand=40, type=40, year=10.
    """
    import numpy as np

    # Float weights keep the division a true division on Python 2 as well
    city_weight = 10.0
    brand_weight = 40.0
    type_weight = 40.0
    year_weight = 10.0

    params = np.array([city_row, brand_row, type_row, year_row])
    weights = np.array([city_weight, brand_weight, type_weight, year_weight])

    # The normaliser is derived from the weights themselves (it adds up to
    # 100) so it cannot drift if a weight is changed
    return np.dot(params, weights) / weights.sum()

In [None]:
# Example of a user request: (city, brand, type, year)
user_request = ('leon', 'bmw', 'custom', 2000)

# We need to calculate some variables from the auxiliary tables.
# Each value is read from the first matching row with .iloc[0]; calling
# float()/int() directly on a one-element Series is deprecated in pandas.
user_city_row = df_locations[df_locations['city'] == user_request[0]].iloc[0]
user_lat = float(user_city_row['lat'])
user_lon = float(user_city_row['lon'])
user_brand = int(df_rank_moto_brand.loc[df_rank_moto_brand['brand'] == user_request[1], 'brand_score'].iloc[0])
user_type = int(df_rank_moto_type.loc[df_rank_moto_type['type'] == user_request[2], 'type_score'].iloc[0])
user_year = user_request[3]

# user_vars[i] is the requested value to compare against column score_columns[i]
user_vars = [user_brand, user_type, user_year]
score_columns = ['brand_score', 'type_score', 'year']

In [None]:
# Distance in km between the city of each ad and the city requested by the user
df_motos_raw_coord_brand_types['city_metric'] = df_motos_raw_coord_brand_types.apply(
    lambda row: cities_distance(row['lat'], row['lon'], user_lat, user_lon), axis=1)

# Absolute difference between each ad and the request for brand score, type
# score and year. The subtraction is done on the whole column at once instead
# of row by row.
for element, score_column, user_var in zip(['brand', 'type', 'year'], score_columns, user_vars):
    new_column = element + '_metric'
    print('%s %s %s' % (new_column, score_column, user_var))
    df_motos_raw_coord_brand_types[new_column] = distance_abs_value(
        df_motos_raw_coord_brand_types[score_column].astype(int), user_var)

# TODO: review the motorbike brand ranking; some brands are missing, which is
# why NaN values appear in the brand_score column after the merge

# Weighted combination of the four partial metrics
df_motos_raw_coord_brand_types['total_metric_pond'] = df_motos_raw_coord_brand_types.apply(
    lambda row: w_s(row['city_metric'], row['brand_metric'], row['type_metric'], row['year_metric']), axis=1)

In [None]:
# Order the ads from the smallest to the largest total weighted metric
results = df_motos_raw_coord_brand_types.sort_values('total_metric_pond')

In [None]:
# Show the five ads with the smallest total_metric_pond
results.head(5)

In [None]:
# Select the ads whose brand_score is null
missing_brand_score = df_motos_raw_coord_brand_types['brand_score'].isnull()
brand_null = df_motos_raw_coord_brand_types[missing_brand_score]

In [None]:
# List each distinct brand found among the ads in brand_null.
# Single-argument print(...) produces the same output on Python 2 and Python 3.
for element in brand_null.brand.unique():
    print(element)

In [None]:
import numpy as np

# Small sample arrays used to try out the element-wise product below
params = np.array((2, 4, 10, 2))
weights = np.array((10, 40, 40, 10))


In [None]:
# Add up the element-wise products of the two sample arrays
(params * weights).sum()