In [41]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vinod\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vinod\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [42]:
df=pd.read_csv("telugu_movie_20k_dataset.csv")
df.head(1)

Unnamed: 0,movie_id,movie_name,genres,themes,overview,actors,director,censor_rating,imdb_rating,popularity_score,review,review_sentiment
0,1,Athadu (2008),"Comedy, Drama, Action",Politics,A sports-based inspirational film about strugg...,"Allu Arjun, Samantha",Koratala Siva,UA,5.1,17,Mass entertainer with powerful dialogues,Negative


In [43]:
#combininf needed features
data=df[["genres","themes","overview","censor_rating","review"]]
data.isnull().sum()

genres           0
themes           0
overview         0
censor_rating    0
review           0
dtype: int64

In [44]:
#combining the data
data["combining_features"]=(
    data["genres"] + ' ' +
    data["themes"] + ' ' +
    data["overview"] + ' ' +
    data["censor_rating"] + ' ' +
    data["review"]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["combining_features"]=(


In [45]:
data["combining_features"]

0        Comedy, Drama, Action Politics A sports-based ...
1        Action, Drama, Romance Friendship, Love A roma...
2        Thriller, Comedy Family A revenge-driven actio...
3        Family, Comedy Revenge, Politics A socially dr...
4        Action, Thriller, Comedy Love A high-energy co...
                               ...                        
19995    Family Family, Social Message A sports-based i...
19996    Thriller Revenge, Social Message A socially dr...
19997    Action, Drama, Family Family, Social Message A...
19998    Action, Fantasy, Romance Sports, Politics A so...
19999    Romance, Drama, Fantasy Family A socially driv...
Name: combining_features, Length: 20000, dtype: object

In [46]:
lemmatizer=WordNetLemmatizer()
stopwords=set(stopwords.words("english"))

In [47]:
 #__________________________(optional)___________________
#preprocessing the data using nlp


def preprocess_data(text):
    text = str(text).lower()   # convert to string + lowercase
    tokens = word_tokenize(text)
    
    st_wor = [
        word for word in tokens
        if word.isalpha() and word not in stopwords
    ]
    
    tokens = [lemmatizer.lemmatize(word) for word in st_wor]
    
    return " ".join(tokens)


In [48]:
data["preprocess_features"]=data["combining_features"].apply(preprocess_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["preprocess_features"]=data["combining_features"].apply(preprocess_data)


In [49]:
#After text preprocessing
data[["preprocess_features","combining_features"]]

Unnamed: 0,preprocess_features,combining_features
0,comedy drama action politics inspirational fil...,"Comedy, Drama, Action Politics A sports-based ..."
1,action drama romance friendship love romantic ...,"Action, Drama, Romance Friendship, Love A roma..."
2,thriller comedy family action drama intense co...,"Thriller, Comedy Family A revenge-driven actio..."
3,family comedy revenge politics socially driven...,"Family, Comedy Revenge, Politics A socially dr..."
4,action thriller comedy love commercial enterta...,"Action, Thriller, Comedy Love A high-energy co..."
...,...,...
19995,family family social message inspirational fil...,"Family Family, Social Message A sports-based i..."
19996,thriller revenge social message socially drive...,"Thriller Revenge, Social Message A socially dr..."
19997,action drama family family social message stor...,"Action, Drama, Family Family, Social Message A..."
19998,action fantasy romance sport politics socially...,"Action, Fantasy, Romance Sports, Politics A so..."


In [50]:
#Tfidf we have to convert text to vectors
vectorize=TfidfVectorizer()
features=vectorize.fit_transform(data["preprocess_features"])

In [51]:
#now we need to use cosine to check similar movies
from sklearn.metrics.pairwise import cosine_similarity
similar=cosine_similarity(features)

In [13]:
# ---------- 1. NORMALIZE FUNCTION ----------
import re
def normalize_title(movie_title):
    if not isinstance(movie_title, str):
        return ""
    title = movie_title.lower()
    title = title.replace(" ", "")
    title = re.sub(r'[^a-z0-9]', '', title)
    return title


# ---------- 2. CREATE NORMALIZED COLUMN ----------
df['normalized_title'] = df['movie_name'].apply(normalize_title)


In [53]:
def recommend_movies(movie_input, df, similarity, top_n=5):
    
    if "normalized_title" not in df.columns:
        df["normalized_title"] = df["movie_name"].apply(normalize_title)

    normalized_input = normalize_title(movie_input)

    matches = df[df["normalized_title"] == normalized_input]

    if matches.empty:
        return []

    idx = matches.index[0]

    scores = list(enumerate(similarity[idx]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)

    return [
        df.iloc[i]["movie_name"]
        for i, _ in scores[1:top_n + 1]
    ]
