In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import nltk
from nltk.corpus import movie_reviews

nltk.download('movie_reviews')

# Opprett en liste av anmeldelser og deres kategorier
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle datasettet
np.random.shuffle(documents)

# Konverter data til en Pandas DataFrame
df = pd.DataFrame(documents, columns=['words', 'category'])

# Konverter ordene til setninger (str)
df['words'] = df['words'].apply(' '.join)

# Del datasettet inn i trenings- og testsett
X_train, X_test, y_train, y_test = train_test_split(df['words'], df['category'], test_size=0.3, random_state=42)

# Bruk CountVectorizer for å konvertere teksten til vektorer
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Opprett en Naive Bayes modell
nb_classifier = MultinomialNB()

# Tren modellen på treningsdata
nb_classifier.fit(X_train_vec, y_train)

# Prediker på testdata
y_pred = nb_classifier.predict(X_test_vec)

# Beregn nøyaktighet
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Nøyaktighet:", accuracy)



[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Nøyaktighet: 0.83


In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans
from sklearn import metrics
import nltk
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split

# Download NLTK's movie_reviews corpus
nltk.download('movie_reviews')

# Load movie_reviews data
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle dataset
np.random.shuffle(documents)

# Convert data to a Pandas DataFrame
df = pd.DataFrame(documents, columns=['words', 'category'])

# Convert words to sentences (str)
df['words'] = df['words'].apply(' '.join)

# Split dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['words'], df['category'], test_size=0.3, random_state=42)

# Preprocessing: Use CountVectorizer to convert text to vectors
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Supervised Learning: Create and train a Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vec, y_train)

# Predict on test data
y_pred = nb_classifier.predict(X_test_vec)

# Evaluate supervised learning model
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Supervised Learning Accuracy:", accuracy)

# Unsupervised Learning: Apply K-Means clustering
kmeans = KMeans(n_clusters=2)  # Assuming 2 clusters for positive and negative
kmeans.fit(X_train_vec)

# Predict clusters on test data
clusters = kmeans.predict(X_test_vec)

# Evaluate unsupervised learning using accuracy (since we know the true labels)
unsupervised_accuracy = metrics.accuracy_score(y_test, clusters)
print("Unsupervised Learning Accuracy:", unsupervised_accuracy)

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Supervised Learning Accuracy: 0.7833333333333333


TypeError: Labels in y_true and y_pred should be of the same type. Got y_true=['neg' 'pos'] and y_pred=[0 1]. Make sure that the predictions provided by the classifier coincides with the true labels.

In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans
from sklearn import metrics
import nltk
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split

# Download NLTK's movie_reviews corpus
nltk.download('movie_reviews')

# Load movie_reviews data
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle dataset
np.random.shuffle(documents)

# Convert data to a Pandas DataFrame
df = pd.DataFrame(documents, columns=['words', 'category'])

# Convert words to sentences (str)
df['words'] = df['words'].apply(' '.join)

# Split dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['words'], df['category'], test_size=0.3, random_state=42)

# Preprocessing: Use CountVectorizer to convert text to vectors
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Supervised Learning: Create and train a Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vec, y_train)

# Predict on test data
y_pred = nb_classifier.predict(X_test_vec)

# Evaluate supervised learning model
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Supervised Learning Accuracy:", accuracy)

# Unsupervised Learning: Apply K-Means clustering
kmeans = KMeans(n_clusters=2)  # Assuming 2 clusters for positive and negative
kmeans.fit(X_train_vec)

# Predict clusters on test data
clusters = kmeans.predict(X_test_vec)

# Convert cluster labels to match 'neg' and 'pos' labels
# Here, we assume the majority label in each cluster represents the cluster
cluster_labels = {0: 'neg', 1: 'pos'}  # Mapping of cluster label to 'neg' or 'pos'
clusters_mapped = [cluster_labels[cluster] for cluster in clusters]

# Evaluate unsupervised learning using accuracy (since we know the true labels)
unsupervised_accuracy = metrics.accuracy_score(y_test, clusters_mapped)
print("Unsupervised Learning Accuracy:", unsupervised_accuracy)


[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Supervised Learning Accuracy: 0.845
Unsupervised Learning Accuracy: 0.445
