<a href="https://colab.research.google.com/github/LazyRook01/NLP/blob/main/IMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import os

# Upload the kaggle.json file if you are using Google Colab
from google.colab import files
files.upload()

# Move the kaggle.json file to the correct directory
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/

# Set appropriate permissions
!chmod 600 ~/.kaggle/kaggle.json

# Download the dataset
!kaggle datasets download -d yasserh/imdb-movie-ratings-sentiment-analysis



Saving kaggle.json to kaggle.json
Downloading imdb-movie-ratings-sentiment-analysis.zip to /content
 97% 20.0M/20.6M [00:01<00:00, 30.6MB/s]
100% 20.6M/20.6M [00:01<00:00, 19.6MB/s]


In [9]:
# Unzip the downloaded dataset
!unzip /content/imdb-movie-ratings-sentiment-analysis.zip

# List the files in the current directory
!ls

Archive:  /content/imdb-movie-ratings-sentiment-analysis.zip
  inflating: movie.csv               
cufile.log  imdb-movie-ratings-sentiment-analysis.zip  rapidsai-csp-utils
drive	    movie.csv				       sample_data


In [10]:
import pandas as pd
df = pd.read_csv("/content/movie.csv")

In [4]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [5]:
%%time
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')


# Function to preprocess text using NLTK
def preprocess_text(text):
    # Tokenization
    words = word_tokenize(text.lower())  # Convert to lowercase for consistency

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.isalnum() and word not in stop_words]

    # Stemming
    ps = PorterStemmer()
    words = [ps.stem(word) for word in words]

    # Join the words back into a string
    processed_text = ' '.join(words)

    return processed_text

# Apply the preprocessing function to the 'text' column
df['processed_text'] = df['text'].apply(preprocess_text)

# Display the DataFrame with the processed text
print(df)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/magics/execution.py", line 1335, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 30, in <module>
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/series.py", line 4771, in apply
    return SeriesApply(self, func, convert_dtype, args, kwargs).apply()
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/apply.py", line 1123, in apply
    return self.apply_standard()
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/apply.py", line 1174, in apply_standard
    mapped = lib.map_infer(
  File "pandas/_libs/lib.pyx", line 2924, in pandas._libs.lib.map_infer
  File "<timed exec>", line 22, in preprocess_text
  File "<timed exec>", line 22, in <listcomp>
  File "/usr/local/lib/python3.10/dist-packages/nltk/stem/porter.py", line 673, in stem
    stem = self._step3(stem)
  File "/usr/local/lib/python3.10/dist-packages/nltk/stem/porter.py", line 530, in _st

TypeError: ignored

In [19]:
%%time

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from multiprocessing import Pool

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Function to preprocess text using NLTK
def preprocess_text(text):
    # Tokenization
    words = word_tokenize(text.lower())  # Convert to lowercase for consistency

    # Remove stopwords, perform stemming, and lemmatization in one step
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(ps.stem(word)) for word in words if word.isalnum() and word not in stop_words]

    # Join the words back into a string
    processed_text = ' '.join(words)

    return processed_text

# Define a function to apply preprocessing to a DataFrame chunk
def preprocess_chunk(chunk):
    chunk['processed_text'] = chunk['text'].apply(preprocess_text)
    return chunk

# Number of CPU cores
num_cores = 8

# Split the DataFrame into chunks for parallel processing
df_split = np.array_split(df, num_cores)

# Create a Pool of workers
with Pool(num_cores) as pool:
    # Apply preprocessing to each chunk in parallel
    df_processed = pd.concat(pool.map(preprocess_chunk, df_split))

# Drop the 'text' column after all chunks are processed
df_processed = df_processed.drop('text', axis=1)

# Display the DataFrame with the processed text
print(df_processed)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


       label                                     processed_text
0          0  grew b 1965 watch love thunderbird mate school...
1          0  put movi dvd player sat coke chip expect hope ...
2          0  peopl know particular time past like feel need...
3          0  even though great interest biblic movi bore de...
4          1  im die hard dad armi fan noth ever chang got t...
...      ...                                                ...
39995      1  western union someth forgotten classic western...
39996      1  movi incred piec work explor everi nook cranni...
39997      0  wife watch movi plan visit sicili stromboli so...
39998      1  first watch flatlin amaz necessari featur good...
39999      1  would film good gross estim award nomin john t...

[40000 rows x 2 columns]
CPU times: user 708 ms, sys: 943 ms, total: 1.65 s
Wall time: 48.6 s


In [20]:
print(df_processed['label'].unique())

[0 1]


In [21]:
%%time

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


# Split the DataFrame into training and testing sets
train_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=42)

# Create a CountVectorizer to convert text into a bag-of-words representation
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['processed_text'])
X_test = vectorizer.transform(test_df['processed_text'])

# Define the target variable
y_train = train_df['label']
y_test = test_df['label']

# Create a Logistic Regression model
model = LogisticRegression(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
classification_report_result = classification_report(y_test, predictions)

print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(classification_report_result)


Accuracy: 0.88
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.87      0.87      3966
           1       0.87      0.88      0.88      4034

    accuracy                           0.88      8000
   macro avg       0.88      0.88      0.88      8000
weighted avg       0.88      0.88      0.88      8000

CPU times: user 11.9 s, sys: 32.8 s, total: 44.7 s
Wall time: 9.67 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [35]:
%%time

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier  # Import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Split the DataFrame into training and testing sets
train_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=42)

# Create a CountVectorizer to convert text into a bag-of-words representation
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['processed_text'])
X_test = vectorizer.transform(test_df['processed_text'])

# Define the target variable
y_train = train_df['label']
y_test = test_df['label']

# Create a Decision Tree Classifier
model = DecisionTreeClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
classification_report_result = classification_report(y_test, predictions)

print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(classification_report_result)


Accuracy: 0.73
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.72      0.72      3966
           1       0.73      0.73      0.73      4034

    accuracy                           0.73      8000
   macro avg       0.73      0.73      0.73      8000
weighted avg       0.73      0.73      0.73      8000

CPU times: user 42.6 s, sys: 0 ns, total: 42.6 s
Wall time: 42 s


In [36]:
%%time

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB  # Import Multinomial Naive Bayes
from sklearn.metrics import accuracy_score, classification_report

# Split the DataFrame into training and testing sets
train_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=42)

# Create a CountVectorizer to convert text into a bag-of-words representation
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['processed_text'])
X_test = vectorizer.transform(test_df['processed_text'])

# Define the target variable
y_train = train_df['label']
y_test = test_df['label']

# Create a Multinomial Naive Bayes Classifier
model = MultinomialNB()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
classification_report_result = classification_report(y_test, predictions)

print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(classification_report_result)


Accuracy: 0.85
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.88      0.85      3966
           1       0.87      0.83      0.85      4034

    accuracy                           0.85      8000
   macro avg       0.85      0.85      0.85      8000
weighted avg       0.85      0.85      0.85      8000

CPU times: user 4.1 s, sys: 0 ns, total: 4.1 s
Wall time: 4.07 s
