In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

# Load the Wine Reviews dataset
wine_data = pd.read_csv('data.csv')

# Let's first preprocess the data, selecting relevant features and handling missing values
# For simplicity, we'll focus on the 'description' and 'points' columns
wine_data = wine_data[['description', 'points']].dropna()

# Define X and y
X = wine_data['description']
y = wine_data['points'] > 90  # Convert points to binary label: 1 if points > 90, else 0

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to numerical features using TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Naive Bayes classifier
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_tfidf, y_train)
y_pred_nb = naive_bayes.predict(X_test_tfidf)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print("Naive Bayes Accuracy:", accuracy_nb)

# Decision Tree classifier
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train_tfidf, y_train)
y_pred_dt = decision_tree.predict(X_test_tfidf)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Accuracy:", accuracy_dt)


Naive Bayes Accuracy: 0.8578811369509044
Decision Tree Accuracy: 0.8969721062744318


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from joblib import Parallel, delayed

# Load the Wine Reviews dataset
wine_data = pd.read_csv('winemag-data-130k-v2.csv')

# Let's first preprocess the data, selecting relevant features and handling missing values
# For simplicity, we'll focus on the 'description' and 'points' columns
wine_data = wine_data[['description', 'points']].dropna()

# Define X and y
X = wine_data['description']
y = wine_data['points'] > 90  # Convert points to binary label: 1 if points > 90, else 0

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to vectorize text data using TF-IDF in parallel
def tfidf_vectorize(data):
    vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
    return vectorizer.fit_transform(data)

# Vectorize training and testing data using parallel processing
X_train_tfidf = Parallel(n_jobs=-1)(delayed(tfidf_vectorize)(X_train))
X_test_tfidf = Parallel(n_jobs=-1)(delayed(tfidf_vectorize)(X_test))

# Naive Bayes classifier
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_tfidf, y_train)
y_pred_nb = naive_bayes.predict(X_test_tfidf)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print("Naive Bayes Accuracy:", accuracy_nb)

# Decision Tree classifier with optimized parameters
decision_tree = DecisionTreeClassifier(max_depth=10, min_samples_split=5)
decision_tree.fit(X_train_tfidf, y_train)
y_pred_dt = decision_tree.predict(X_test_tfidf)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Accuracy:", accuracy_dt)


FileNotFoundError: [Errno 2] No such file or directory: 'winemag-data-130k-v2.csv'