In [16]:
import pandas as pd

# Load the data. No header row is assumed; column 0 holds the raw
# ' ::: '-delimited record that is split below.
train_df = pd.read_csv('train_data.csv', header=None)
test_df = pd.read_csv('test_data.csv', header=None)

# Inspect the first record of each file to check the structure
print("Train Data First Row:", train_df.iloc[0, 0])
print("Test Data First Row:", test_df.iloc[0, 0])

# Split the data into columns.
# Train records carry four fields (index, title/year, genre, plot) and test
# records three (no genre). Capping the number of splits keeps a stray ' ::: '
# inside a plot from producing extra columns, which would otherwise break the
# fixed-length column naming applied to these frames later on.
train_split = train_df[0].str.split(' ::: ', n=3, expand=True)
test_split = test_df[0].str.split(' ::: ', n=2, expand=True)

# Display the split data to understand the structure
print("Train Split Columns:", train_split.columns)
print("Test Split Columns:", test_split.columns)

# Display the first few rows of the split data
print(train_split.head())
print(test_split.head())


Train Data First Row: 1 ::: Oscar et la dame rose (2009) ::: drama ::: Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone except straight-talking Rose, the lady in pink he meets on the hospital stairs. As Christmas approaches, Rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow Oscar to live life and love to the full, in the company of his friends Pop Corn, Einstein, Bacon and childhood sweetheart Peggy Blue.
Test Data First Row: 1 ::: Edgar's Lunch (1998) ::: L.R. Brane loves his life - his car, his apartment, his job, but especially his girlfriend, Vespa. One day while showering, Vespa runs out of shampoo. L.R. runs across the street to a convenience store to buy some more, a quick trip of no more than a few minutes. When he returns, Vespa is gone and every trace of her existence has

In [17]:
# Assign column names
train_split.columns = ['Index', 'Title_Year', 'Genre', 'Plot']
test_split.columns = ['Index', 'Title_Year', 'Plot']

# "<title> (<four-digit year>)" -- group 1 is the title, group 2 the year
TITLE_YEAR_PATTERN = r'(.+) \((\d{4})\)'


def split_title_year(frame):
    """Return a copy of ``frame`` with 'Title_Year' replaced by 'Title' and 'Year'.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a string column named 'Title_Year'.

    Returns
    -------
    pandas.DataFrame
        The input columns minus 'Title_Year', followed by 'Title' and 'Year'
        (both strings). When 'Title_Year' does not match the
        "<title> (<year>)" pattern, 'Year' is missing and 'Title' falls back
        to the raw 'Title_Year' text instead of being lost.
    """
    parts = frame['Title_Year'].str.extract(TITLE_YEAR_PATTERN)
    # str.extract yields NaN for every group on a non-match; keep the raw text
    # as the title in that case so the row still has a usable name.
    title = parts[0].fillna(frame['Title_Year'])
    return frame.drop(columns=['Title_Year']).assign(Title=title, Year=parts[1])


# Further split 'Title_Year' into 'Title' and 'Year' and drop the original column
train_split = split_title_year(train_split)
test_split = split_title_year(test_split)

# Display the first few rows of the cleaned data
print(train_split.head())
print(test_split.head())


  Index     Genre                                               Plot  \
0     1     drama  Listening in to a conversation between his doc...   
1     2  thriller  A brother and sister with a past incestuous re...   
2     3     adult  As the bus empties the students for their fiel...   
3     4     drama  To help their unemployed father make ends meet...   
4     5     drama  The film's title refers not only to the un-rec...   

                       Title  Year  
0      Oscar et la dame rose  2009  
1                      Cupid  1997  
2  Young, Wild and Wonderful  1980  
3             The Secret Sin  1915  
4            The Unrecovered  2007  
  Index                                               Plot  \
0     1  L.R. Brane loves his life - his car, his apart...   
1     2  Spain, March 1964: Quico is a very naughty chi...   
2     3  One year in the life of Albin and his family o...   
3     4  His father has died, he hasn't spoken with his...   
4     5  Before he was known intern

In [18]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources needed below: the stopword lists and WordNet
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Single lemmatizer instance shared by the text-cleaning function
lemmatizer = WordNetLemmatizer()

# English stopwords, loaded once. Building this set inside the function would
# re-read the corpus for every row the function is applied to.
STOP_WORDS = frozenset(stopwords.words('english'))


# Define a function to clean and preprocess text
def preprocess_text(text):
    """Normalize a plot summary into a space-separated string of lemmas.

    Steps: replace every non-word character with a space, lowercase, split on
    whitespace, drop English stopwords, lemmatize each remaining token, and
    join the tokens back with single spaces.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. None/NaN from a record that had
        no plot field) is treated as empty.

    Returns
    -------
    str
        The cleaned text; '' when the input is not a string or nothing
        survives the filtering.
    """
    # Missing values would make re.sub raise TypeError; map them to empty text
    if not isinstance(text, str):
        return ''
    # Remove non-alphanumeric characters, then convert to lowercase
    text = re.sub(r'\W', ' ', text).lower()
    # Drop stopwords and lemmatize what is left
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in STOP_WORDS]
    # Join words back into a single string
    return ' '.join(words)

# Derive a 'Cleaned_Plot' column from the raw 'Plot' text in both frames
for frame in (train_split, test_split):
    frame['Cleaned_Plot'] = frame['Plot'].apply(preprocess_text)

# Preview the first rows of each frame, training data first
for frame in (train_split, test_split):
    print(frame.head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


  Index     Genre                                               Plot  \
0     1     drama  Listening in to a conversation between his doc...   
1     2  thriller  A brother and sister with a past incestuous re...   
2     3     adult  As the bus empties the students for their fiel...   
3     4     drama  To help their unemployed father make ends meet...   
4     5     drama  The film's title refers not only to the un-rec...   

                       Title  Year  \
0      Oscar et la dame rose  2009   
1                      Cupid  1997   
2  Young, Wild and Wonderful  1980   
3             The Secret Sin  1915   
4            The Unrecovered  2007   

                                        Cleaned_Plot  
0  listening conversation doctor parent 10 year o...  
1  brother sister past incestuous relationship cu...  
2  bus empty student field trip museum natural hi...  
3  help unemployed father make end meet edith twi...  
4  film title refers un recovered body ground zer...  
  Index 

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer limited to at most 5000 features
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

train_plots = train_split['Cleaned_Plot']
test_plots = test_split['Cleaned_Plot']

# Learn the vocabulary and weights from the training plots only,
# then reuse the fitted vectorizer for the test plots
X_train = tfidf_vectorizer.fit_transform(train_plots)
X_test = tfidf_vectorizer.transform(test_plots)

# Report (rows, features) for each matrix
for matrix in (X_train, X_test):
    print(matrix.shape)


(54214, 5000)
(54200, 5000)


In [20]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with the iteration cap raised to 1000
model = LogisticRegression(max_iter=1000)

# Targets are the genre labels from the training frame
y_train = train_split['Genre']

# Fit on the training features; the fitted estimator is the cell's output
model.fit(X_train, y_train)


In [21]:
# Classify every row of the test feature matrix with the fitted model
y_pred = model.predict(X=X_test)

# Show the predicted labels (long arrays are abbreviated when printed)
print("Predicted genres for the test data:", y_pred)


Predicted genres for the test data: ['drama' 'drama' 'documentary' ... 'comedy' 'drama' 'documentary']
