In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


# End-to-end experiment: predict the road-surface label from the free-text
# weather description using TF-IDF features and a k-nearest-neighbours
# classifier, then print per-class precision/recall/F1.

# Load data
df = pd.read_csv('road_accidents.csv')

labels = df['Road_Surface'].values


# NLTK resources needed below: the stop-word list and the tokenizer model
# used by nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')

stop_words = stopwords.words('english')
# Set copy for O(1) membership tests; `stop_words` itself stays a list.
stop_word_set = set(stop_words)

# Built once and reused for every row instead of once per word.
stemmer = PorterStemmer()
non_word = re.compile(r'\W')


# Preprocess: lowercase, replace non-word characters with spaces, tokenize,
# drop stop words, stem, and re-join into one normalized string per row.
# Missing values become empty strings so a NaN cell cannot crash `.lower()`.
sentences = []
for sent in df['Weather_Conditions'].fillna('').astype(str):
    tokens = nltk.word_tokenize(non_word.sub(' ', sent.lower()))
    sentences.append(
        " ".join(stemmer.stem(word) for word in tokens if word not in stop_word_set)
    )


# Split data *before* vectorizing. Same sample count, test_size and
# random_state as splitting the feature matrix, so the partition is unchanged.
train_sentences, test_sentences, y_train, y_test = train_test_split(
    sentences, labels, test_size=0.2, random_state=42
)


# Vectorize: learn vocabulary and IDF weights from the training rows only,
# so no statistics from the held-out rows leak into the model.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_sentences)
X_test = vectorizer.transform(test_sentences)
# Full feature matrix, kept so any later code that references `X` still works.
X = vectorizer.transform(sentences)


# Train model
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)


# Predictions
y_pred = knn.predict(X_test)


# Evaluate. A class that is never predicted has undefined precision;
# zero_division=0 reports it as 0.0 (the same value the default prints)
# without emitting an UndefinedMetricWarning for each average.
print(classification_report(y_test, y_pred, zero_division=0))




              precision    recall  f1-score   support

         Dry       0.73      1.00      0.84        24
 Frost / Ice       0.00      0.00      0.00         3
  Wet / Damp       1.00      0.54      0.70        13

    accuracy                           0.78        40
   macro avg       0.58      0.51      0.51        40
weighted avg       0.76      0.78      0.73        40



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
