In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from gensim.models import Word2Vec

In [2]:
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               11914 non-null  object 
 1   Model              11914 non-null  object 
 2   Year               11914 non-null  int64  
 3   Engine Fuel Type   11911 non-null  object 
 4   Engine HP          11845 non-null  float64
 5   Engine Cylinders   11884 non-null  float64
 6   Transmission Type  11914 non-null  object 
 7   Driven_Wheels      11914 non-null  object 
 8   Number of Doors    11908 non-null  float64
 9   Market Category    8172 non-null   object 
 10  Vehicle Size       11914 non-null  object 
 11  Vehicle Style      11914 non-null  object 
 12  highway MPG        11914 non-null  int64  
 13  city mpg           11914 non-null  int64  
 14  Popularity         11914 non-null  int64  
 15  MSRP               11914 non-null  int64  
dtypes: float64(3), int64(5

In [4]:
df = data.copy()

# Data Preprocessing

In [5]:
# Combine make and model
df['Maker_Model'] = df['Make']+" "+df['Model']

In [6]:
# Only text columns to be kept
df1 = df[['Engine Fuel Type','Transmission Type','Driven_Wheels','Market Category','Vehicle Size','Vehicle Style', 'Maker_Model']]
print(df1.shape)
df1.head()

(11914, 7)


Unnamed: 0,Engine Fuel Type,Transmission Type,Driven_Wheels,Market Category,Vehicle Size,Vehicle Style,Maker_Model
0,premium unleaded (required),MANUAL,rear wheel drive,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,BMW 1 Series M
1,premium unleaded (required),MANUAL,rear wheel drive,"Luxury,Performance",Compact,Convertible,BMW 1 Series
2,premium unleaded (required),MANUAL,rear wheel drive,"Luxury,High-Performance",Compact,Coupe,BMW 1 Series
3,premium unleaded (required),MANUAL,rear wheel drive,"Luxury,Performance",Compact,Coupe,BMW 1 Series
4,premium unleaded (required),MANUAL,rear wheel drive,Luxury,Compact,Convertible,BMW 1 Series


In [7]:
# Combine all into one column
df2 = df1.apply(lambda x: ','.join(x.astype(str)),axis = 1)
print(df2.shape)
df2.head()

(11914,)


0    premium unleaded (required),MANUAL,rear wheel ...
1    premium unleaded (required),MANUAL,rear wheel ...
2    premium unleaded (required),MANUAL,rear wheel ...
3    premium unleaded (required),MANUAL,rear wheel ...
4    premium unleaded (required),MANUAL,rear wheel ...
dtype: object

In [8]:
df_clean = pd.DataFrame({'clean':df2}) 
df_clean.head()

Unnamed: 0,clean
0,"premium unleaded (required),MANUAL,rear wheel ..."
1,"premium unleaded (required),MANUAL,rear wheel ..."
2,"premium unleaded (required),MANUAL,rear wheel ..."
3,"premium unleaded (required),MANUAL,rear wheel ..."
4,"premium unleaded (required),MANUAL,rear wheel ..."


In [9]:
df_clean.shape

(11914, 1)

# Bag-of-words(CountVectorizer and Normalized CountVectorizer

In [10]:
cv = CountVectorizer()
count_vectorized = cv.fit_transform(df_clean['clean'])

In [11]:
count_vectorized

<11914x845 sparse matrix of type '<class 'numpy.int64'>'
	with 152091 stored elements in Compressed Sparse Row format>

In [12]:
df_count_vectorized = pd.DataFrame(count_vectorized.toarray(), columns=cv.get_feature_names())
df_count_vectorized.head()

Unnamed: 0,10,100,124,12c,15,150,1500,1500hd,16,190,...,xv,yaris,yorker,yukon,z3,z4,z71,z8,zdx,zephyr
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
[i for i in list(df_count_vectorized.loc[0]) if i]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [14]:
normalized_count_occurrence = count_vectorized.toarray() / np.sum(count_vectorized.toarray(), axis=1, keepdims=True)

In [15]:
df_norm_count_occurrence = pd.DataFrame(normalized_count_occurrence, columns=cv.get_feature_names())
df_norm_count_occurrence.head()

Unnamed: 0,10,100,124,12c,15,150,1500,1500hd,16,190,...,xv,yaris,yorker,yukon,z3,z4,z71,z8,zdx,zephyr
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
[i for i in list(df_norm_count_occurrence.loc[0]) if i]

[0.0625,
 0.0625,
 0.0625,
 0.0625,
 0.0625,
 0.0625,
 0.0625,
 0.0625,
 0.0625,
 0.0625,
 0.0625,
 0.0625,
 0.0625,
 0.0625,
 0.0625,
 0.0625]

# TF-IDF

In [17]:
tfidf = TfidfVectorizer()
tfidf_vectorized = tfidf.fit_transform(df_clean['clean'])

In [18]:
df_tfidf_vectorized = pd.DataFrame(tfidf_vectorized.toarray(), columns=tfidf.get_feature_names())
df_tfidf_vectorized.head()

Unnamed: 0,10,100,124,12c,15,150,1500,1500hd,16,190,...,xv,yaris,yorker,yukon,z3,z4,z71,z8,zdx,zephyr
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
[i for i in list(df_tfidf_vectorized.loc[0]) if i]

[0.396968057420812,
 0.16642291325192127,
 0.28241585962308335,
 0.08683687109616055,
 0.3432325713466415,
 0.27353043429543145,
 0.19861455446960802,
 0.20847361826660427,
 0.1931654829179385,
 0.1904558062356049,
 0.19645036490015455,
 0.23907512816427573,
 0.38648131871478714,
 0.3432325713466415,
 0.08849235830288406,
 0.08683687109616055]

# Embedding with Word2Vec

<ul>two-layer neural network</ul>
<ul>Input for this is text and output is a set of vectors. </ul>
<ul>Gensim library on the custom corpus is implemented using algorithms like CBOW(Continuous Bag of Words), SG(Skip Gram).</ul>

In [20]:
sent = [row.split(',') for row in df_clean['clean']]
sent[-2:]

[['premium unleaded (recommended)',
  'AUTOMATIC',
  'all wheel drive',
  'Crossover',
  'Hatchback',
  'Luxury',
  'Midsize',
  '4dr Hatchback',
  'Acura ZDX'],
 ['regular unleaded',
  'AUTOMATIC',
  'front wheel drive',
  'Luxury',
  'Midsize',
  'Sedan',
  'Lincoln Zephyr']]

In [21]:
model = Word2Vec(sent, min_count =1, vector_size=50, window=3, sg=1)

size: The number of dimensions of the embeddings and the default is 100.<br>
min_count: The minimum count of words to consider when training the model; words with occurrence less than this count will be ignored. The default for min_count is 5.<br>
workers: The number of partitions during training and the default workers is 3.<br>
window: The maximum distance between a target word and words around the target word. The default window is 5.<br>
sg: The training algorithm, either CBOW(0) or skip gram (1). The default training alogrithm is CBOW<br>

In [22]:
model.wv['Lincoln Zephyr']

array([ 0.0130883 , -0.00897469,  0.02029793, -0.02182404, -0.01794816,
       -0.02045001,  0.00767897,  0.00638522,  0.01119214, -0.01179093,
        0.00722474,  0.00327387,  0.01588829,  0.00462027, -0.01393587,
        0.0188003 , -0.01741093, -0.00865233,  0.0139042 , -0.01725176,
       -0.00200607, -0.00761845, -0.01643736,  0.01086152, -0.01362265,
        0.01912935, -0.00442427,  0.02069392, -0.00581624,  0.02018023,
        0.02172275,  0.01895268,  0.01521047, -0.01335437, -0.00760081,
       -0.01058436, -0.00161575,  0.00793566,  0.00574602, -0.00414342,
        0.01438946, -0.01662086, -0.00351258, -0.01558528, -0.00024243,
        0.00502782, -0.02212464,  0.00316941, -0.01663932, -0.00209782],
      dtype=float32)