In [28]:
# Mount Google Drive at /content/drive; a later cell copies the Kaggle token from it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
!pip install -q kaggle

In [30]:
!mkdir ~/.kaggle
!chmod 600 ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [33]:
!cp "/content/drive/MyDrive/CHALLENGE_MOTTU/kaggle (2).json" ~/.kaggle/kaggle.json

In [34]:
# Download the motorcycle dataset archive from Kaggle into the current directory.
!kaggle datasets download -d nehalbirla/motorcycle-dataset

Dataset URL: https://www.kaggle.com/datasets/nehalbirla/motorcycle-dataset
License(s): DbCL-1.0
Downloading motorcycle-dataset.zip to /content
  0% 0.00/12.2k [00:00<?, ?B/s]
100% 12.2k/12.2k [00:00<00:00, 20.4MB/s]


In [35]:
!unzip /content/motorcycle-dataset.zip -d /content/motorcycle_dataset

Archive:  /content/motorcycle-dataset.zip
  inflating: /content/motorcycle_dataset/BIKE DETAILS.csv  


In [36]:
# List the extraction directory to confirm the CSV was unpacked.
!ls /content/motorcycle_dataset

'BIKE DETAILS.csv'


In [41]:
import pandas as pd
import os

csv_path = '/content/motorcycle_dataset/BIKE DETAILS.csv'

# Show what sits next to the CSV (the directory is derived from csv_path).
print(os.listdir(os.path.dirname(csv_path)))

# Load the listings, then preview the first rows and the column summary.
df = pd.read_csv(csv_path)

print(df.head())

df.info()

['BIKE DETAILS.csv']
                                  name  selling_price  year seller_type  \
0            Royal Enfield Classic 350         175000  2019  Individual   
1                            Honda Dio          45000  2017  Individual   
2  Royal Enfield Classic Gunmetal Grey         150000  2018  Individual   
3    Yamaha Fazer FI V 2.0 [2016-2018]          65000  2015  Individual   
4                Yamaha SZ [2013-2014]          20000  2011  Individual   

       owner  km_driven  ex_showroom_price  
0  1st owner        350                NaN  
1  1st owner       5650                NaN  
2  1st owner      12000           148114.0  
3  1st owner      23000            89643.0  
4  2nd owner      21000                NaN  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1061 entries, 0 to 1060
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               1061 non-null   object 
 1   sel

In [42]:
import re

def preprocess_text(text):
    """Normalize a bike name before TF-IDF vectorization.

    The text is lower-cased and every character that is not an ASCII letter,
    a digit or whitespace is deleted. Punctuation is removed rather than
    replaced by a space, so "2.0" becomes "20" and "[2016-2018]" becomes
    "20162018".

    Args:
        text: The raw name. ``None`` and float NaN (a missing cell) are
            treated as an empty name; any other non-string value is
            converted with ``str()`` first.

    Returns:
        The cleaned, lower-cased string.
    """
    # A missing value has no .lower(); NaN is the only float for which x != x.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # Remove special characters and punctuation.
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

# Clean every name and print it next to the original for comparison.
df['processed_name'] = df['name'].map(preprocess_text)
print(df.loc[:, ['name', 'processed_name']].head())

                                  name                       processed_name
0            Royal Enfield Classic 350            royal enfield classic 350
1                            Honda Dio                            honda dio
2  Royal Enfield Classic Gunmetal Grey  royal enfield classic gunmetal grey
3    Yamaha Fazer FI V 2.0 [2016-2018]        yamaha fazer fi v 20 20162018
4                Yamaha SZ [2013-2014]                   yamaha sz 20132014


In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit on the processed names and transform them
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_name'])

# Shape of the TF-IDF matrix
print(tfidf_matrix.shape)

(1061, 264)


In [44]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of TF-IDF rows; with a single
# argument the matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

# Shape of the similarity matrix
print(cosine_sim.shape)

(1061, 1061)


In [45]:
def get_similar_bikes(bike_name, cosine_sim=cosine_sim, df=df, top_n=10):
    """Return the names of the listings most similar to ``bike_name``.

    The first row whose ``name`` equals ``bike_name`` is used as the query.
    All other rows are ranked by their score in that row of ``cosine_sim``
    (highest first, ties kept in row order) and the best ``top_n`` are
    returned. Rows with the same name as the query are separate listings and
    can therefore appear in the result.

    Note: the defaults for ``cosine_sim`` and ``df`` are bound when this
    function is defined; re-assigning those globals later does not change
    what the defaults point to.

    Args:
        bike_name: Exact value to look up in the ``name`` column.
        cosine_sim: Square similarity matrix whose row/column order matches
            the row order of ``df``.
        df: DataFrame with a ``name`` column.
        top_n: Maximum number of similar listings to return (default 10).

    Returns:
        A Series of names (indexed like ``df``) for the most similar rows,
        or a message string when ``bike_name`` is not present.
    """
    # Work with row positions, not index labels: cosine_sim and .iloc are
    # positional, so this stays correct when df has a non-default index.
    bike_pos = next(
        (pos for pos, name in enumerate(df['name']) if name == bike_name),
        None,
    )
    if bike_pos is None:
        return f"Moto '{bike_name}' não encontrada no dataset."

    # Rank every row by similarity to the query (descending, stable on ties).
    ranked = sorted(
        enumerate(cosine_sim[bike_pos]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Drop the query row by position instead of assuming it sorted first:
    # another row with an equal (or rounding-higher) score may precede it.
    limit = max(int(top_n), 0)
    similar_positions = [pos for pos, _ in ranked if pos != bike_pos][:limit]

    # Return the names of the similar bikes.
    return df['name'].iloc[similar_positions]

# Example: look up the bikes most similar to one known listing name.
query_bike = 'Royal Enfield Classic 350'
similar_bikes = get_similar_bikes(query_bike)
print(f"Motos similares a '{query_bike}':\n{similar_bikes}")

Motos similares a 'Royal Enfield Classic 350':
101    Royal Enfield Classic 350
127    Royal Enfield Classic 350
142    Royal Enfield Classic 350
168    Royal Enfield Classic 350
179    Royal Enfield Classic 350
221    Royal Enfield Classic 350
297    Royal Enfield Classic 350
313    Royal Enfield Classic 350
316    Royal Enfield Classic 350
328    Royal Enfield Classic 350
Name: name, dtype: object


In [46]:
from sklearn.cluster import KMeans
import numpy as np

# Number of clusters (tune as needed)
n_clusters = 5

# Cluster the TF-IDF vectors with K-Means using a fixed random_state
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
clusters = kmeans.fit_predict(tfidf_matrix)

# Store each row's cluster label on the DataFrame
df['cluster'] = clusters

# Print a few example bike names from every cluster
for cluster_id in range(n_clusters):
    print(f"\nCluster {cluster_id}:")
    in_cluster = df['cluster'] == cluster_id
    print(df.loc[in_cluster, 'name'].head())


Cluster 0:
14                          Jawa 42
15    Suzuki Access 125 [2007-2016]
20             Hero Passion Pro 110
23                    Jawa Standard
26            TVS Apache RTR 160 4V
Name: name, dtype: object

Cluster 1:
0                Royal Enfield Classic 350
2      Royal Enfield Classic Gunmetal Grey
7     Royal Enfield Bullet 350 [2007-2011]
18     Royal Enfield Classic Gunmetal Grey
24           Royal Enfield Thunderbird 350
Name: name, dtype: object

Cluster 2:
9           Bajaj Discover 125
12    Bajaj Avenger Street 220
40           Bajaj Dominar 400
46          Bajaj Discover 125
50          Bajaj Discover 100
Name: name, dtype: object

Cluster 3:
3     Yamaha Fazer FI V 2.0 [2016-2018]
4                 Yamaha SZ [2013-2014]
10                          Yamaha FZ16
13                        Yamaha YZF R3
17                     Yamaha YZF R15 S
Name: name, dtype: object

Cluster 4:
1                  Honda Dio
5           Honda CB Twister
6       Honda CB Hornet 160R

In [53]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
import re

# Load the DataFrame
df = pd.read_csv('/content/motorcycle_dataset/BIKE DETAILS.csv')

# Pre-processing function
def preprocess_text(text):
    """Normalize a bike name before TF-IDF vectorization.

    The text is lower-cased and every character that is not an ASCII letter,
    a digit or whitespace is deleted. Punctuation is removed rather than
    replaced by a space, so "2.0" becomes "20" and "[2016-2018]" becomes
    "20162018".

    Args:
        text: The raw name. ``None`` and float NaN (a missing cell) are
            treated as an empty name; any other non-string value is
            converted with ``str()`` first.

    Returns:
        The cleaned, lower-cased string.
    """
    # A missing value has no .lower(); NaN is the only float for which x != x.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # Remove special characters and punctuation.
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

# Add the 'processed_name' column
df['processed_name'] = df['name'].map(preprocess_text)

# Create the TF-IDF vectorizer and fit it on the processed names
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(df['processed_name'])

# Save the fitted vectorizer to disk
vectorizer_path = 'tfidf_vectorizer.joblib'
joblib.dump(tfidf_vectorizer, vectorizer_path)

# Hand the saved file to Colab's download helper
from google.colab import files
files.download(vectorizer_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [52]:
import sklearn
import pandas as pd
import joblib
# Print the library versions in use in this runtime.
for label, module in (("scikit-learn", sklearn), ("pandas", pd), ("joblib", joblib)):
    print(f"{label} version: {module.__version__}")

scikit-learn version: 1.6.1
pandas version: 2.2.2
joblib version: 1.5.0
