<a href="https://colab.research.google.com/github/NetoRibeiro/DATA8001Assignment1/blob/main/Project001.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Project 001
##Data Science and Analytics Course
###Linear Regression

####Data Understanding

In [270]:
%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib.colors
import numpy as np
import pandas as pd
from datetime import datetime as dt, timedelta as td
import calendar
import re
import math
import string

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [2]:
file_name = '/content/drive/MyDrive/Data Science and Analytics/R00206995/data/R00206995_original.csv'

In [3]:
df_original = pd.read_csv(file_name)

In [4]:
df_original.head(3)

Unnamed: 0,car_reg,purchase_date,county,make,model,type,colour,tax_band,price
0,XXX-X-2315,2020-07-01,Cork,Audi : A4 (Saloon),,,red,b,55287
1,191-C-3750,2019-Jan-20,,mazda,CX-30,SUV,#C0C0C0,B,41690
2,191-l-3155,21 Mar,LIMERICK,BMW : 3 Series,,Saloon,WHITE,2,40381


In [5]:
# Find Features with missing values
df_original.isnull().any()

car_reg          False
purchase_date    False
county            True
make             False
model             True
type              True
colour           False
tax_band         False
price            False
dtype: bool

In [6]:
# Filter all missing values on MODEL
df_original.loc[df_original['model'].isnull() == True]

Unnamed: 0,car_reg,purchase_date,county,make,model,type,colour,tax_band,price
0,XXX-X-2315,2020-07-01,Cork,Audi : A4 (Saloon),,,red,b,55287
2,191-l-3155,21 Mar,LIMERICK,BMW : 3 Series,,Saloon,WHITE,2,40381
3,191-d-2645,26 Jan,DUBLIN,Audi : Q3,,SUV,ORANGE,2,44836
6,XXX-X-4891,2018-01-06,Waterford,Opel : Astra (Estate),,,white,b,29616
9,XXX-X-4163,2019-01-02,Galway,Ford : Mondeo (Saloon),,,silver,b,38185
...,...,...,...,...,...,...,...,...,...
3790,XXX-X-814,2020-01-05,Dublin,Mercedes : C-Class (Estate),,,blue,b,49622
3793,XXX-X-576,2019-01-16,Dublin,Mercedes : C-Class (Estate),,,orange,b,52120
3794,XXX-X-3209,2020-05-08,Dublin,Toyota : Corolla (Saloon),,,white,b,29484
3797,201-l-868,27 Jan,LIMERICK,Mercedes : C-Class,,Saloon,BLUE,2,103621


##Registration ETL

In [7]:
#Create a Registration related features
df_registration = df_original[{'car_reg', 'purchase_date', 'county'}].copy()

In [8]:
#Transform features to lower case
df_registration['purchase_date'] = df_registration['purchase_date'].str.lower()
df_registration['county'] = df_registration['county'].str.lower()
df_registration['car_reg'] = df_registration['car_reg'].str.lower()

In [9]:
#Print size of the data frame an example of the first 3 rows
print(f'{df_registration.shape}\n {df_registration.tail(3)}')

(3800, 3)
          car_reg purchase_date    county
3797   201-l-868        27 jan  limerick
3798  191-d-1747   2019-mar-24       NaN
3799  xxx-x-3426    2019-07-05      cork


In [10]:
def registration_status(car_registration, looking_for):
  '''
  Purpose of that function: To get patterns on the CAR Registration
  :param car_registration: String value
  :param looking_for: Patterns, Year, County Code, Registration_number
  '''
  # Get the century for the registration
  reg_date = dt.fromordinal(dt.today().toordinal())
  reg_century = reg_date.strftime('%Y')[:2]
  reg_patterns = '[0-9]+[-]+[a-z]+[-]+[0-9]'
  reg_missing_year_county = '[x]+[-]+[x]+[-]+[0-9]'
  reg_missing_year = '[x]+[-]+[a-z]+[-]+[0-9]'
  reg_missing_county = '[0-9]+[-]+[x]+[-]+[0-9]'
  car_reg_return = ""
  try:
    
    looking_for = looking_for.lower()
    reg_year, reg_county, reg_number = car_registration.split("-")

    if looking_for == "registration_number":
      car_reg_return = reg_number

    elif (re.search(reg_patterns, car_registration)):
      
      if looking_for == "patterns":
        car_reg_return = "match"
      elif looking_for == "year":
        car_reg_return = f'{reg_century}{reg_year[:2]}'
      elif looking_for == "county":
        car_reg_return = reg_county
      else:
        car_reg_return = ""
      
    elif (re.search(reg_missing_year_county, car_registration)):
      
      if looking_for == "patterns":
        car_reg_return = "missing_year_county"
      elif looking_for == "year":
        car_reg_return = ""
      elif looking_for == "county":
        car_reg_return = ""
      else:
        car_reg_return = ""
      
    elif (re.search(reg_missing_year, car_registration)):
      
      if looking_for == "patterns":
        car_reg_return = "missing_year"
      elif looking_for == "year":
        car_reg_return = ""
      elif looking_for == "county":
        car_reg_return = reg_county
      else:
        car_reg_return = ""

    elif (re.search(reg_missing_county, car_registration)):
      
      if looking_for == "patterns":
        car_reg_return = "missing_county"
      elif looking_for == "year":
        car_reg_return = f'{reg_century}{reg_year[:2]}'
      elif looking_for == "county":
        car_reg_return = ""
      else:
        car_reg_return = ""

    else:
      car_reg_return = ""
      
    return car_reg_return
    

  except Exception as ex:
    raise Exception(f'Car registration is not matching with the expected paterns :: {ex}')   

In [11]:
# Create new column feature using lambda function
df_registration['reg_county'] = df_registration.apply(lambda row: registration_status(row['car_reg'], "COUNTY"), axis=1)
df_registration['reg_number'] = df_registration.apply(lambda row: registration_status(row['car_reg'], "Registration_number"), axis=1)

In [12]:
df_counties = df_registration.loc[(df_registration['county'].isnull() == False) & (df_registration['reg_county'] != "")][{'county', 'reg_county', 'car_reg'}].copy()

In [13]:
df_counties = df_counties.groupby(['county', 'reg_county'], as_index=False)['car_reg'].count()

In [14]:
# encoder for County and County Code
label_county = LabelEncoder()
label_reg_county = LabelEncoder()
# train the encoder for County and County Code
label_county.fit(df_counties['county'])
label_reg_county.fit(df_counties['reg_county'])

LabelEncoder()

In [15]:
# transform and add new columns with the label encoder
df_counties['county_lbl'] = label_county.transform(df_counties['county'])
df_counties['reg_county_lbl'] = label_reg_county.transform(df_counties['reg_county'])

In [16]:
# Drop unsed column
df_counties.drop(['car_reg'], axis=1, inplace=True)

In [17]:
df_county_label = df_counties[{'county', 'county_lbl'}].copy()
df_county_code_label = df_counties[{'reg_county', 'reg_county_lbl'}].copy()

In [18]:
#Merge the county label to the registration data frame
df_registration = pd.merge(df_registration, df_county_label, how='left', on='county')
df_registration = pd.merge(df_registration, df_county_code_label, how='left', on='reg_county')

In [19]:
#Fill county label missing values with the ones is not missing on county code label
df_registration['county_lbl'].fillna(df_registration['reg_county_lbl'], axis=0, inplace=True)
#Drop Reg_County_lbl
df_registration.drop(['reg_county_lbl'], axis=1, inplace=True)
#Convert to integer
df_registration['county_lbl'] = df_registration['county_lbl'].astype(int)

In [20]:
# transform and add new columns with the label encoder
df_registration['county'] = label_county.inverse_transform(df_registration['county_lbl'])
df_registration['reg_county'] = label_reg_county.inverse_transform(df_registration['county_lbl'])

In [21]:
df_registration.head(3)

Unnamed: 0,car_reg,purchase_date,county,reg_county,reg_number,county_lbl
0,xxx-x-2315,2020-07-01,cork,c,2315,0
1,191-c-3750,2019-jan-20,cork,c,3750,0
2,191-l-3155,21 mar,limerick,l,3155,3


##Purchese Date ETL

In [22]:
#Distinct dates in the original format
df_registration.purchase_date.value_counts()

13 jan         15
12 jan         15
2020-jan-15    14
21 jan         14
22 jan         13
               ..
2019-oct-19     1
2018-apr-10     1
2019-05-24      1
2020-oct-04     1
2020-08-16      1
Name: purchase_date, Length: 1231, dtype: int64

In [23]:
def get_month_num(month):
  month_dict = dict((month.lower(), index) for index, month in enumerate(calendar.month_abbr) if month)
  try:
    month_num = month_dict[month.lower()]
    return month_num
  except Exception as ex:
    raise Exception(f'Month does not matching with the expected patterns :: {ex}')  


def get_semester_num(month):
  '''
  Purpose of that function it is to convert the Month in Semester (1 or 2)
  :param month: Month from the purchase date
  '''
  try:
    
    month_num = pd.to_numeric(month)
    
    if month_num > 6:
      return 2
    
    else:
      return 1
  
  except Exception as ex:
    raise Exception(f'Month number does not matching with the expected patterns :: {ex}')  
                     

def date_patterns(purchase_dates, looking_for, registration):
  '''
  Purpose of that function: Find patterns on the date format
  :param purchase_dates: string date in original format
  :param looking_for: Patterns, Year, Month, Day
  :param registration: String registration number
  '''
  dt_patterns_yyyymmdd = '[0-9]+[-]+[0-9]+[-]+[0-9]'
  dt_patterns_yyyymmmdd = '[0-9]+[-]+[a-z]+[-]+[0-9]'
  dt_missing_year_ddmmm = '[0-9]+[-]+[a-z]'
  dt_missing_year_dd_mmm = '[0-9]+[ ]+[a-z]'
  car_purchase_date_return = ""
  try:
    looking_for = looking_for.lower()
    str(purchase_dates)
    if (re.search(dt_patterns_yyyymmdd, purchase_dates)):
      if looking_for == "patterns":
        car_purchase_date_return = "yyyy-mm-dd"
      elif looking_for == "year":
        car_purchase_date_return = purchase_dates[:4]
      elif looking_for == "month":
        car_purchase_date_return = purchase_dates[5:7]
      elif looking_for == "day":
        car_purchase_date_return = purchase_dates[8:10]
      else:
        car_purchase_date_return = ""

      
    elif (re.search(dt_patterns_yyyymmmdd, purchase_dates)):
      if looking_for == "patterns":
        car_purchase_date_return = "yyyy-mmm-dd"
      elif looking_for == "year":
        car_purchase_date_return = purchase_dates[:4]
      elif looking_for == "month":
        month = purchase_dates[5:8]
        car_purchase_date_return = get_month_num(month)
      elif looking_for == "day":
        car_purchase_date_return = purchase_dates[9:11]
      else:
        car_purchase_date_return = ""
      
      
    elif (re.search(dt_missing_year_ddmmm, purchase_dates)):
      if looking_for == "patterns":
        car_purchase_date_return = "ddmmm"
      elif looking_for == "year":
        car_purchase_date_return = registration_status(registration, "YEAR")
      elif looking_for == "month":
        month = purchase_dates[2:5]
        car_purchase_date_return = get_month_num(month)
      elif looking_for == "day":
        car_purchase_date_return = purchase_dates[:2]
      else:
        car_purchase_date_return = ""

    elif (re.search(dt_missing_year_dd_mmm, purchase_dates)):
      if looking_for == "patterns":
        car_purchase_date_return = "dd_mmm"
      elif looking_for == "year":
        car_purchase_date_return = registration_status(registration, "YEAR")
      elif looking_for == "month":
        month = purchase_dates[3:6]
        car_purchase_date_return = get_month_num(month)
      elif looking_for == "day":
        car_purchase_date_return = purchase_dates[:2]
      else:
        car_purchase_date_return = ""
      
    else:
      
      car_purchase_date_return = "verify"
      
    return car_purchase_date_return
    #return car_reg_status

  except Exception as ex:
    raise Exception(f'Purchase date is not matching with the expected patterns :: {ex}')  

In [24]:
#Apply lambda function to get year and month from purchase date
df_registration['purchase_year'] = df_registration.apply(lambda row: date_patterns(row['purchase_date'], "YEAR", row['car_reg']), axis=1)
df_registration['purchase_month'] = df_registration.apply(lambda row: date_patterns(row['purchase_date'], "MONTH", row['car_reg']), axis=1)
df_registration['purchase_day'] = df_registration.apply(lambda row: date_patterns(row['purchase_date'], "DAY", row['car_reg']), axis=1)
df_registration['purchase_month'] = pd.to_numeric(df_registration['purchase_month'])

In [25]:
#Apply lambda function to get semester from purchase date
df_registration['purchase_semester'] = df_registration.apply(lambda row: get_semester_num(row['purchase_month']), axis=1)

In [26]:
print(f'{df_registration["purchase_year"].value_counts()}\n Total:\t{df_registration["purchase_year"].value_counts().sum()}')

2020    1294
2019    1278
2018    1228
Name: purchase_year, dtype: int64
 Total:	3800


In [27]:
def return_to_patterns(col1, col2, col3, col4, pattern):
  """
  The purpose of the function:
  Concatenate string columns into the requested pattern
  :param col1: String (Year)
  :param col2: String (County or Month)
  :param col3: String (Registration Number or Day)
  :param col4: String (Semester)
  :param pattern: String to create Car Registration or Date
  :return: New string combined the columns into the requested pattern
  """
  try:
    reg_date = dt.fromordinal(dt.today().toordinal())
    reg_century = reg_date.strftime('%Y')[:2]
    pattern = pattern.lower()
    
    if pattern == "registration":
      
      try:
        if len(col1) == 4:
          year = col1[2:4]
        elif len(col1) == 2:
          year = col1
        else:
          year = ""
          
        if len(col2) == 1:
          county_code =  col2.upper()
        else:
          county_code = ""

        if len(col3) > 0:
          reg_number = col3
        else:
          reg_number = ""
          
        if 0 < col4 < 3:
          semester = col4
        else:
          semester = ""
        
        pattern_return = f'{year}{semester}-{county_code}-{reg_number}'  
        return pattern_return
        
          
      except Exception as ex:
        raise Exception(f'Error Registration Patterns: {ex}')
    elif pattern == "date":
      try:
        if len(col1) == 4:
          year = col1
        elif len(col1) == 2:
          year = f'{reg_century}{col1}'
        else:
          year = ""
          
        if col2 < 10:
          month =  f'0{col2}'
        elif col2 > 0:
          month =  col2
        else:
          month = ""

        if len(col3) == 1:
          day = f'0{col3}'
        elif len(col3) == 2:
          day = col3
        else:
          day = ""
        
        pattern_return = f'{year}-{month}-{day}'
        return pattern_return
      
      except Exception as ex:
          raise Exception(f'Error Date Patterns: {ex}')
    
    
  except Exception as ex:
    raise Exception(f'error: {ex}')

In [28]:
# Apply lambda function to get Car Registration or Date with the correct pattern
df_registration['purchase_date'] = df_registration.apply(
    lambda row: return_to_patterns(
        row['purchase_year'], row['purchase_month'], row['purchase_day'], row['purchase_semester'], "DATE"),
    axis=1)

In [29]:
df_registration['car_reg'] = df_registration.apply(
    lambda row: return_to_patterns(
        row['purchase_year'], row['reg_county'], row['reg_number'], row['purchase_semester'], "REGISTRATION"),
    axis=1)

In [30]:
df_registration = df_registration[{'car_reg', 'purchase_date', 'county'}].copy()

In [31]:
df_second = df_original[{'make',	'model',	'type',	'colour',	'tax_band',	'price'}].copy()

In [32]:
df_test = pd.DataFrame.join(df_second, df_registration)

In [33]:
df_test['colour'].value_counts()

white                      404
White                      400
WHITE                      351
<colour>White</colour>     224
red                        195
Blue                       189
blue                       185
RED                        182
Red                        181
BLUE                       177
#FFFFFF                    157
Silver                     147
silver                     144
SILVER                     131
<colour>Red</colour>       115
<colour>Blue</colour>      113
<colour>Silver</colour>     89
#0000FF                     79
#FF0000                     79
#C0C0C0                     74
ORANGE                      51
orange                      48
Orange                      39
<colour>Orange</colour>     25
#FFA500                     21
Name: colour, dtype: int64

##Tax Band ETL

In [34]:
df_original.head(3)

Unnamed: 0,car_reg,purchase_date,county,make,model,type,colour,tax_band,price
0,XXX-X-2315,2020-07-01,Cork,Audi : A4 (Saloon),,,red,b,55287
1,191-C-3750,2019-Jan-20,,mazda,CX-30,SUV,#C0C0C0,B,41690
2,191-l-3155,21 Mar,LIMERICK,BMW : 3 Series,,Saloon,WHITE,2,40381


In [35]:
df_tax_band = df_original.loc[df_original['tax_band'].isnull() == False][{'tax_band'}].copy()

In [36]:
df_tax_band['tax_band'] = df_tax_band['tax_band'].str.upper()

In [37]:
set(df_tax_band['tax_band'])

{'1', '2', '3', '4', 'A', 'B', 'C', 'D'}

In [38]:
def isNumber(numb):
  is_num = bool(re.search(r'\d', numb))
  return is_num

In [39]:
df_tax_band['isNumber'] = df_tax_band['tax_band'].apply(lambda row: isNumber(row))

In [40]:
df_tax_band.head()

Unnamed: 0,tax_band,isNumber
0,B,False
1,B,False
2,2,True
3,2,True
4,A,False


In [41]:
df_tax_band_string = df_tax_band.loc[df_tax_band['isNumber'] == False ][{'tax_band', 'isNumber'}].copy()

In [42]:
df_tax_band_train = pd.DataFrame(list(string.ascii_uppercase))
df_tax_band_train.rename({0: "tax_band"}, axis=1, inplace=True)

In [43]:
df_tax_band_train.head(3)

Unnamed: 0,tax_band
0,A
1,B
2,C


In [44]:
# encoder for Tax Band
label_band = LabelEncoder()
# train the encoder for Tax Band
label_band.fit(df_tax_band_train['tax_band'])

LabelEncoder()

In [45]:
# transform and add new columns with the label encoder
# email sending to customer about the tax band
df_tax_band_string['tax_band_lbl'] = label_band.transform(df_tax_band_string['tax_band'])

In [46]:
df_tax_band_string.head(3)

Unnamed: 0,tax_band,isNumber,tax_band_lbl
0,B,False,1
1,B,False,1
4,A,False,0


In [47]:
df_band_final = df_tax_band_string.groupby(['tax_band', 'tax_band_lbl'], as_index=False)['isNumber'].count()

In [48]:
df_band_final.drop(['isNumber'], axis=1, inplace=True)
df_band_final

Unnamed: 0,tax_band,tax_band_lbl
0,A,0
1,B,1
2,C,2
3,D,3


In [49]:
df_tax_band.shape

(3800, 2)

In [50]:
df_tax_band = pd.merge(df_tax_band, df_band_final, how='left', on='tax_band')

In [51]:
df_tax_band['tax_band_lbl'].fillna(df_tax_band['tax_band'], axis=0, inplace=True)

In [52]:
df_tax_band['tax_band_lbl'] = df_tax_band['tax_band_lbl'].astype(int)

In [53]:
df_tax_band['tax_band'] = label_band.inverse_transform(df_tax_band['tax_band_lbl'])

In [54]:
df_tax_band.drop(['isNumber', 'tax_band_lbl'], axis=1, inplace=True)

In [55]:
df_original.head(3)

Unnamed: 0,car_reg,purchase_date,county,make,model,type,colour,tax_band,price
0,XXX-X-2315,2020-07-01,Cork,Audi : A4 (Saloon),,,red,b,55287
1,191-C-3750,2019-Jan-20,,mazda,CX-30,SUV,#C0C0C0,B,41690
2,191-l-3155,21 Mar,LIMERICK,BMW : 3 Series,,Saloon,WHITE,2,40381


In [56]:
df_original_tax_band = df_original[{'car_reg', 'purchase_date', 'county', 'make'}].copy()
df_original_tax_band.shape, df_original.shape, df_original_tax_band.head(3)

((3800, 4),
 (3800, 9),
                  make     car_reg purchase_date    county
 0  Audi : A4 (Saloon)  XXX-X-2315    2020-07-01      Cork
 1               mazda  191-C-3750   2019-Jan-20       NaN
 2      BMW : 3 Series  191-l-3155        21 Mar  LIMERICK)

In [57]:
df_test_tax_band = pd.DataFrame.join(df_original_tax_band, df_tax_band)
print(f'{df_test_tax_band.shape}\n{df_test_tax_band.head(3)}')

(3800, 5)
                 make     car_reg purchase_date    county tax_band
0  Audi : A4 (Saloon)  XXX-X-2315    2020-07-01      Cork        B
1               mazda  191-C-3750   2019-Jan-20       NaN        B
2      BMW : 3 Series  191-l-3155        21 Mar  LIMERICK        C


##Make, Mode and Type ETL

In [209]:
df_original.head(3)

Unnamed: 0,car_reg,purchase_date,county,make,model,type,colour,tax_band,price
0,XXX-X-2315,2020-07-01,Cork,Audi : A4 (Saloon),,,red,b,55287
1,191-C-3750,2019-Jan-20,,mazda,CX-30,SUV,#C0C0C0,B,41690
2,191-l-3155,21 Mar,LIMERICK,BMW : 3 Series,,Saloon,WHITE,2,40381


In [210]:
df_car_model = df_original[{'make', 'model', 'type'}].copy()
print(f'{df_car_model.shape}\n{df_car_model.head(3)}')

(3800, 3)
                 make  model    type
0  Audi : A4 (Saloon)    NaN     NaN
1               mazda  CX-30     SUV
2      BMW : 3 Series    NaN  Saloon


In [211]:
df_car_model.isna().any()

make     False
model     True
type      True
dtype: bool

In [212]:
df_car_model['model'].fillna("", axis=0, inplace=True)
df_car_model['type'].fillna("", axis=0, inplace=True)

In [213]:
df_car_model.rename(columns={'make': 'old_make'}, inplace=True)
df_car_model.rename(columns={'model': 'old_model'}, inplace=True)
df_car_model.rename(columns={'type': 'old_type'}, inplace=True)

In [220]:
def split_car_model_pattern(make, model, cartype, return_column):
  inside_make = ""
  inside_model_type = ""
  inside_model = ""
  inside_type = ""
  make_pattern_col = '[]:[]'
  make_pattern_obra = '[]([]'
  make_pattern_cbra = '[])[]'
  return_column = return_column.lower()
  
  if (re.search(make_pattern_col, make)):
    start_c, end_c = re.search(make_pattern_col, make).span()
    inside_make = make[:start_c].strip().lower()
    inside_model_type = make[end_c:].strip().lower()
    if (re.search(make_pattern_obra, inside_model_type)):
      start_o, end_o = re.search(make_pattern_obra, inside_model_type).span()
      inside_type = inside_model_type[end_o:].replace(")","").strip().lower()
      inside_model = inside_model_type[:start_o].strip().lower()
      ma_mo_ty_return = f'{inside_make}|{inside_model}|{inside_type}'
      
    else:
      inside_model = make[end_c:].strip().lower()
      inside_type = cartype.lower()
      ma_mo_ty_return = f'{inside_make}|{inside_model}|{inside_type}'
      
  else:
    inside_make = make.strip().lower()
    inside_model = model.strip().lower()
    inside_type = cartype.strip().lower()
    ma_mo_ty_return = (f'{inside_make}|{inside_model}|{inside_type}')

  if return_column == "allinone":
    return ma_mo_ty_return
  elif return_column == "make":
    return inside_make
  elif return_column == "model":
    return inside_model
  elif return_column == "type":
    return inside_type
  else:
    return ""

In [221]:
df_car_model['ma_mo_ty'] = df_car_model.apply(lambda row: split_car_model_pattern(row['old_make'], row['old_model'], row['old_type'], "AllInOne"), axis=1)

In [222]:
df_car_model['make'] = df_car_model.apply(lambda row: split_car_model_pattern(row['old_make'], row['old_model'], row['old_type'], "Make"), axis=1)
df_car_model['model'] = df_car_model.apply(lambda row: split_car_model_pattern(row['old_make'], row['old_model'], row['old_type'], "Model"), axis=1)
df_car_model['type'] = df_car_model.apply(lambda row: split_car_model_pattern(row['old_make'], row['old_model'], row['old_type'], "Type"), axis=1)

In [223]:
df_car_model.head()

Unnamed: 0,old_make,old_model,old_type,ma_mo_ty,make,model,type
0,Audi : A4 (Saloon),,,audi|a4|saloon,audi,a4,saloon
1,mazda,CX-30,SUV,mazda|cx-30|suv,mazda,cx-30,suv
2,BMW : 3 Series,,Saloon,bmw|3 series|saloon,bmw,3 series,saloon
3,Audi : Q3,,SUV,audi|q3|suv,audi,q3,suv
4,Opel,Astra,Hatchback,opel|astra|hatchback,opel,astra,hatchback


In [225]:
df_car_model_final = df_car_model[{'make', 'model', 'type'}].copy()

In [226]:
df_car_model_final.head()

Unnamed: 0,make,model,type
0,audi,a4,saloon
1,mazda,cx-30,suv
2,bmw,3 series,saloon
3,audi,q3,suv
4,opel,astra,hatchback


In [227]:
df_original.head(3)

Unnamed: 0,car_reg,purchase_date,county,make,model,type,colour,tax_band,price
0,XXX-X-2315,2020-07-01,Cork,Audi : A4 (Saloon),,,red,b,55287
1,191-C-3750,2019-Jan-20,,mazda,CX-30,SUV,#C0C0C0,B,41690
2,191-l-3155,21 Mar,LIMERICK,BMW : 3 Series,,Saloon,WHITE,2,40381


In [228]:
df_original_model = df_original[{'car_reg', 'purchase_date', 'county', 'colour', 'tax_band', 'price'}].copy()

In [230]:
df_ori_model = pd.DataFrame.join(df_original_model, df_car_model_final)

In [233]:
print(f'{df_ori_model.shape}\n{df_ori_model.head(3)}')

(3800, 9)
  tax_band     car_reg    county  price  ... purchase_date   make     model    type
0        b  XXX-X-2315      Cork  55287  ...    2020-07-01   audi        a4  saloon
1        B  191-C-3750       NaN  41690  ...   2019-Jan-20  mazda     cx-30     suv
2        2  191-l-3155  LIMERICK  40381  ...        21 Mar    bmw  3 series  saloon

[3 rows x 9 columns]


##Working with Colour
### TO DO

#### research about

https://matplotlib.org/stable/api/colors_api.html?highlight=colors#module-matplotlib.colors

In [236]:
df_original.head(3)

Unnamed: 0,car_reg,purchase_date,county,make,model,type,colour,tax_band,price
0,XXX-X-2315,2020-07-01,Cork,Audi : A4 (Saloon),,,red,b,55287
1,191-C-3750,2019-Jan-20,,mazda,CX-30,SUV,#C0C0C0,B,41690
2,191-l-3155,21 Mar,LIMERICK,BMW : 3 Series,,Saloon,WHITE,2,40381


In [248]:
df_colour = df_original[{'colour'}].copy()
df_colour.head(3)

Unnamed: 0,colour
0,red
1,#C0C0C0
2,WHITE


In [252]:
df_colour.isna().any(), df_colour.isnull().any(), df_colour.loc[df_colour['colour'] == "" ].any()

(colour    False
 dtype: bool, colour    False
 dtype: bool, colour    False
 dtype: bool)

In [254]:
set(df_colour['colour'])

{'#0000FF',
 '#C0C0C0',
 '#FF0000',
 '#FFA500',
 '#FFFFFF',
 '<colour>Blue</colour>',
 '<colour>Orange</colour>',
 '<colour>Red</colour>',
 '<colour>Silver</colour>',
 '<colour>White</colour>',
 'BLUE',
 'Blue',
 'ORANGE',
 'Orange',
 'RED',
 'Red',
 'SILVER',
 'Silver',
 'WHITE',
 'White',
 'blue',
 'orange',
 'red',
 'silver',
 'white'}

In [255]:
color_hex = '#0000FF'

In [264]:
color_hex
hex_pattern = r'#[a-fA-F0-9]{3}(?:[a-fA-F0-9]{3})?$'

(0, 0, 255)

In [275]:
colors_mat = matplotlib.colors

In [276]:
colors_mat.to_rgb(color_hex)

(0.0, 0.0, 1.0)

In [282]:
colors_mat.is_color_like(color_hex)

True

In [286]:
if color_hex in colors_mat.get_named_colors_mapping():
  print(colors_mat.to_rgb(color_hex))

In [288]:
colors_mat.to_rgb(color_hex)

(0.0, 0.0, 1.0)

In [289]:
colors_mat.to_hex('blue')

'#0000ff'

In [291]:
table_colours = colors_mat.TABLEAU_COLORS

In [310]:
table_of_colours = colors_mat.get_named_colors_mapping()

In [312]:
df_colourstable = pd.DataFrame(table_of_colours)

In [313]:
df_colourstable.head()

Unnamed: 0,xkcd:cloudy blue,xkcd:dark pastel green,xkcd:dust,xkcd:electric lime,xkcd:fresh green,xkcd:light eggplant,xkcd:nasty green,xkcd:really light blue,xkcd:tea,xkcd:warm purple,xkcd:yellowish tan,xkcd:cement,xkcd:dark grass green,xkcd:dusty teal,xkcd:grey teal,xkcd:macaroni and cheese,xkcd:pinkish tan,xkcd:spruce,xkcd:strong blue,xkcd:toxic green,xkcd:windows blue,xkcd:blue blue,xkcd:blue with a hint of purple,xkcd:booger,xkcd:bright sea green,xkcd:dark green blue,xkcd:deep turquoise,xkcd:green teal,xkcd:strong pink,xkcd:bland,xkcd:deep aqua,xkcd:lavender pink,xkcd:light moss green,xkcd:light seafoam green,xkcd:olive yellow,xkcd:pig pink,xkcd:deep lilac,xkcd:desert,xkcd:dusty lavender,xkcd:purpley grey,...,seashell,sienna,silver,skyblue,slateblue,slategray,slategrey,snow,springgreen,steelblue,tan,teal,thistle,tomato,turquoise,violet,wheat,white,whitesmoke,yellow,yellowgreen,tab:blue,tab:orange,tab:green,tab:red,tab:purple,tab:brown,tab:pink,tab:gray,tab:olive,tab:cyan,tab:grey,b,g,r,c,m,y,k,w
0,#acc2d9,#56ae57,#b2996e,#a8ff04,#69d84f,#894585,#70b23f,#d4ffff,#65ab7c,#952e8f,#fcfc81,#a5a391,#388004,#4c9085,#5e9b8a,#efb435,#d99b82,#0a5f38,#0c06f7,#61de2a,#3778bf,#2242c7,#533cc6,#9bb53c,#05ffa6,#1f6357,#017374,#0cb577,#ff0789,#afa88b,#08787f,#dd85d7,#a6c875,#a7ffb5,#c2b709,#e78ea5,#966ebd,#ccad60,#ac86a8,#947e94,...,#FFF5EE,#A0522D,#C0C0C0,#87CEEB,#6A5ACD,#708090,#708090,#FFFAFA,#00FF7F,#4682B4,#D2B48C,#008080,#D8BFD8,#FF6347,#40E0D0,#EE82EE,#F5DEB3,#FFFFFF,#F5F5F5,#FFFF00,#9ACD32,#1f77b4,#ff7f0e,#2ca02c,#d62728,#9467bd,#8c564b,#e377c2,#7f7f7f,#bcbd22,#17becf,#7f7f7f,0,0.0,1,0.0,0.75,0.75,0,1
1,#acc2d9,#56ae57,#b2996e,#a8ff04,#69d84f,#894585,#70b23f,#d4ffff,#65ab7c,#952e8f,#fcfc81,#a5a391,#388004,#4c9085,#5e9b8a,#efb435,#d99b82,#0a5f38,#0c06f7,#61de2a,#3778bf,#2242c7,#533cc6,#9bb53c,#05ffa6,#1f6357,#017374,#0cb577,#ff0789,#afa88b,#08787f,#dd85d7,#a6c875,#a7ffb5,#c2b709,#e78ea5,#966ebd,#ccad60,#ac86a8,#947e94,...,#FFF5EE,#A0522D,#C0C0C0,#87CEEB,#6A5ACD,#708090,#708090,#FFFAFA,#00FF7F,#4682B4,#D2B48C,#008080,#D8BFD8,#FF6347,#40E0D0,#EE82EE,#F5DEB3,#FFFFFF,#F5F5F5,#FFFF00,#9ACD32,#1f77b4,#ff7f0e,#2ca02c,#d62728,#9467bd,#8c564b,#e377c2,#7f7f7f,#bcbd22,#17becf,#7f7f7f,0,0.5,0,0.75,0.0,0.75,0,1
2,#acc2d9,#56ae57,#b2996e,#a8ff04,#69d84f,#894585,#70b23f,#d4ffff,#65ab7c,#952e8f,#fcfc81,#a5a391,#388004,#4c9085,#5e9b8a,#efb435,#d99b82,#0a5f38,#0c06f7,#61de2a,#3778bf,#2242c7,#533cc6,#9bb53c,#05ffa6,#1f6357,#017374,#0cb577,#ff0789,#afa88b,#08787f,#dd85d7,#a6c875,#a7ffb5,#c2b709,#e78ea5,#966ebd,#ccad60,#ac86a8,#947e94,...,#FFF5EE,#A0522D,#C0C0C0,#87CEEB,#6A5ACD,#708090,#708090,#FFFAFA,#00FF7F,#4682B4,#D2B48C,#008080,#D8BFD8,#FF6347,#40E0D0,#EE82EE,#F5DEB3,#FFFFFF,#F5F5F5,#FFFF00,#9ACD32,#1f77b4,#ff7f0e,#2ca02c,#d62728,#9467bd,#8c564b,#e377c2,#7f7f7f,#bcbd22,#17becf,#7f7f7f,1,0.0,0,0.75,0.75,0.0,0,1


In [316]:
table_of_colours['magenta']

'#FF00FF'

In [320]:
table_colours.keys()

odict_keys(['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan'])

In [325]:
table_colours.items()

odict_items([('tab:blue', '#1f77b4'), ('tab:orange', '#ff7f0e'), ('tab:green', '#2ca02c'), ('tab:red', '#d62728'), ('tab:purple', '#9467bd'), ('tab:brown', '#8c564b'), ('tab:pink', '#e377c2'), ('tab:gray', '#7f7f7f'), ('tab:olive', '#bcbd22'), ('tab:cyan', '#17becf')])

In [328]:
table_colours.values()

odict_values(['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'])

In [336]:
colors_mat.TABLEAU_COLORS

OrderedDict([('tab:blue', '#1f77b4'),
             ('tab:orange', '#ff7f0e'),
             ('tab:green', '#2ca02c'),
             ('tab:red', '#d62728'),
             ('tab:purple', '#9467bd'),
             ('tab:brown', '#8c564b'),
             ('tab:pink', '#e377c2'),
             ('tab:gray', '#7f7f7f'),
             ('tab:olive', '#bcbd22'),
             ('tab:cyan', '#17becf')])

In [338]:
col_blue = 'blue'

In [339]:
for col in colors_mat.TABLEAU_COLORS:
  if col_blue in col:
    print(col)

tab:blue


In [350]:
for col in colors_mat.TABLEAU_COLORS:
  hex = colors_mat.TABLEAU_COLORS[col]
  print(hex)
  if col_blue in col:
    print(col)

#1f77b4
tab:blue
#ff7f0e
#2ca02c
#d62728
#9467bd
#8c564b
#e377c2
#7f7f7f
#bcbd22
#17becf


In [347]:
colors_mat.TABLEAU_COLORS['tab:blue']

'#1f77b4'