# Project

In [1]:
import pandas as pd


# Load the dataset
df = pd.read_csv("C:\\Users\\alphi\\Downloads\\IMDB Dataset.csv\\IMDB Dataset.csv")

# Display basic info
print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   review      50000 non-null  object
 1   sentiment   50000 non-null  int64 
 2   word_count  50000 non-null  int64 
 3   char_count  50000 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 1.5+ MB
None
                                              review  sentiment  word_count  \
0  One of the other reviewers has mentioned that ...          1         307   
1  A wonderful little production. <br /><br />The...          1         162   
2  I thought this was a wonderful way to spend ti...          1         166   
3  Basically there's a family where a little boy ...          0         138   
4  Petter Mattei's "Love in the Time of Money" is...          1         230   

   char_count  
0        1761  
1         998  
2         926  
3         748  
4        1317  


# Handling Missing Values

In [2]:
print(df.isnull().sum())  # Check for missing values
df.dropna(inplace=True)  # Remove missing values (if any)


review        0
sentiment     0
word_count    0
char_count    0
dtype: int64


# Handling Duplicates
## Removing duplicate values if they exist

In [3]:
df.drop_duplicates(inplace=True)


# Text Pre-processing
### Lowercasing: Convert text to lowercase.
### Removing Special Characters: Remove punctuation, symbols, and numbers.
### Removing Stopwords: Filter out words like "the," "is," "and" (using NLTK or SpaCy).
### Stemming/Lemmatization: Reduce words to their root form.
### Tokenization: Split text into words.

In [4]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download necessary NLTK resources (only once)
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

# Load dataset

df = pd.read_csv("C:\\Users\\alphi\\Downloads\\IMDB Dataset.csv\\IMDB Dataset.csv")

# Ensure 'review' column exists
if "review" not in df.columns:
    raise ValueError("Column 'review' not found in the dataset.")

# Preprocessing setup
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

def preprocess_text(text):
    if pd.isna(text):  # Handle NaN values
        return ""
    
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    tokens = word_tokenize(text)  # Tokenization
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    tokens = [stemmer.stem(word) for word in tokens]  # Stemming
    return " ".join(tokens)





# Convert Categorical Data

In [5]:
df["sentiment"] = df["sentiment"].map({"positive": 1, "negative": 0})

# Handle Class Imbalance

# Feature Engineering

In [6]:
df["word_count"] = df["review"].apply(lambda x: len(x.split()))
df["char_count"] = df["review"].apply(lambda x: len(x))

# Save the Cleaned Dataset

In [12]:
df.to_csv("C:\\Users\\alphi\\Downloads\\IMDB Dataset.csv\\IMDB Dataset.csv", index=False)
cleaned_df = df.copy()

In [14]:
cleaned_df.head(20)

Unnamed: 0,review,sentiment,word_count,char_count
0,One of the other reviewers has mentioned that ...,,307,1761
1,A wonderful little production. <br /><br />The...,,162,998
2,I thought this was a wonderful way to spend ti...,,166,926
3,Basically there's a family where a little boy ...,,138,748
4,"Petter Mattei's ""Love in the Time of Money"" is...",,230,1317
5,"Probably my all-time favorite movie, a story o...",,119,656
6,I sure would like to see a resurrection of a u...,,150,726
7,"This show was an amazing, fresh & innovative i...",,174,934
8,Encouraged by the positive comments about this...,,130,681
9,If you like original gut wrenching laughter yo...,,33,176


In [15]:
df.head(20)

Unnamed: 0,review,sentiment,word_count,char_count
0,One of the other reviewers has mentioned that ...,,307,1761
1,A wonderful little production. <br /><br />The...,,162,998
2,I thought this was a wonderful way to spend ti...,,166,926
3,Basically there's a family where a little boy ...,,138,748
4,"Petter Mattei's ""Love in the Time of Money"" is...",,230,1317
5,"Probably my all-time favorite movie, a story o...",,119,656
6,I sure would like to see a resurrection of a u...,,150,726
7,"This show was an amazing, fresh & innovative i...",,174,934
8,Encouraged by the positive comments about this...,,130,681
9,If you like original gut wrenching laughter yo...,,33,176


### count vectorization

In [11]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB