In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download the NLTK resources used below: the English stop-word list, the
# tokenizer models behind word_tokenize, and WordNet for the lemmatizer.
# Newer NLTK releases make word_tokenize load 'punkt_tab' instead of the
# pickled 'punkt' models, so both are requested to work on old and new installs.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

# Read the reviews CSV into `df`. Any failure is reported and leaves
# `df` as None so the processing step can detect it and skip itself.
try:
    df = pd.read_csv(
        "/content/imdb_master.csv",
        encoding="ISO-8859-1",
        on_bad_lines='skip',
    )
    print("Data loaded successfully.")
except Exception as load_error:
    print(f"Error loading file: {load_error}")
    # Sentinel checked by the `if df is not None` guard further down
    df = None

# Proceed if df is loaded
if df is not None:
    # Check column names
    print("Columns in dataset:", df.columns)

    # Everything below operates on the 'review' column, so fail fast without it.
    if 'review' not in df.columns:
        print("No 'review' column found in the dataset.")
        raise ValueError("Expected 'review' column not found")

    # Drop rows with NaN values in the review column
    df = df.dropna(subset=['review'])

    # Initialize the lemmatizer and stopwords
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))

    # Compiled once and reused for every review.
    # [^>]* (instead of .*?) also matches tags that span a line break.
    HTML_TAG_RE = re.compile(r'<[^>]*>')
    # \W alone keeps digits and underscores; this class removes them as well.
    NON_LETTER_RE = re.compile(r'[\W\d_]+')

    def preprocess(text):
        """Normalize one review into a space-joined string of lemmatized tokens.

        Steps: replace HTML tags with a space, replace every run of
        non-letter characters (punctuation, digits, underscores) with a
        space, lowercase, tokenize, drop English stop words, lemmatize.

        Args:
            text: Raw review text.

        Returns:
            The cleaned review as a single string; an empty string when
            ``text`` is not a string.
        """
        if not isinstance(text, str):
            return ""
        # Substitute a space, not '', so "great<br />film" does not become "greatfilm"
        text = HTML_TAG_RE.sub(' ', text)
        # Remove special characters and digits
        text = NON_LETTER_RE.sub(' ', text)
        # Convert to lowercase
        text = text.lower()
        # Tokenize the text
        words = word_tokenize(text)
        # Remove stopwords and lemmatize
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        return " ".join(words)

    # Apply the preprocessing function to the review column
    df['cleaned_review'] = df['review'].apply(preprocess)

    # Display a few samples of the cleaned data
    print(df[['review', 'cleaned_review']].head())
else:
    print("Data loading failed. Please check the file path or format.")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Data loaded successfully.
Columns in dataset: Index(['Unnamed: 0', 'type', 'review', 'label', 'file'], dtype='object')
                                              review  \
0  Once again Mr. Costner has dragged out a movie...   
1  This is an example of why the majority of acti...   
2  First of all I hate those moronic rappers, who...   
3  Not even the Beatles could write songs everyon...   
4  Brass pictures (movies is not a fitting word f...   

                                      cleaned_review  
0  mr costner dragged movie far longer necessary ...  
1  example majority action film generic boring re...  
2  first hate moronic rapper could nt act gun pre...  
3  even beatles could write song everyone liked a...  
4  brass picture movie fitting word really somewh...  


In [None]:
from gensim.models import Word2Vec

# Turn every cleaned review into its list of whitespace-separated tokens,
# the list-of-token-lists form passed to Word2Vec below.
sentences = list(map(str.split, df['cleaned_review']))

# Fit the Word2Vec embedding model on the tokenized reviews
word2vec_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Both encodings are built from the same cleaned review text.
cleaned_reviews = df['cleaned_review']

# Bag of Words: vectorizer limited to 1000 features
vectorizer = CountVectorizer(max_features=1000)
bow_matrix = vectorizer.fit_transform(cleaned_reviews)

# One-Hot Encoding: same configuration, with binary=True.
# Naming note: `one_hot_matrix` is the vectorizer object and
# `one_hot_encoding` is the matrix it produces.
one_hot_matrix = CountVectorizer(binary=True, max_features=1000)
one_hot_encoding = one_hot_matrix.fit_transform(cleaned_reviews)


**Perform any two of the basic NLP tasks**

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import download

# Download VADER lexicon
download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Calculate sentiment on the ORIGINAL review text, not 'cleaned_review'.
# VADER's rules use negations ("not", "no"), "but", intensifiers, punctuation
# and capitalization; the cleaning step lowercases the text and strips
# punctuation and stop words, so "not good" would be scored as "good".
# Only HTML tags are removed here (replaced by a space so words stay separate).
df['sentiment_score'] = df['review'].apply(
    lambda review: sid.polarity_scores(re.sub(r'<[^>]*>', ' ', review))['compound']
)

# Binary label: a compound score of exactly 0 (neutral) is labelled 'negative'.
df['sentiment'] = df['sentiment_score'].apply(lambda score: 'positive' if score > 0 else 'negative')


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [None]:
import spacy
nlp = spacy.load("en_core_web_sm") # Load the spaCy English language model

# Run NER on the original review text rather than 'cleaned_review': the
# cleaned text is lowercased and has stop words and punctuation removed,
# which strips the casing and sentence context entity recognition relies on.
# HTML tags are replaced by a space so they are not picked up as tokens.
ner_texts = df['review'].map(lambda review: re.sub(r'<[^>]*>', ' ', review))

# nlp.pipe processes the texts in batches instead of one nlp() call per row.
# Wrapping the result in a Series keeps it aligned with the frame's index.
df['named_entities'] = pd.Series(
    [[(ent.text, ent.label_) for ent in doc.ents] for doc in nlp.pipe(ner_texts)],
    index=ner_texts.index,
)

Link to the GitHub repository: https://github.com/Haris-Khan7/NLP-AUST-1
