<a href="https://colab.research.google.com/github/Laiba-raza/NLP/blob/main/NLP_CCP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

!pip install -q gensim sentence-transformers spacy nltk kaggle streamlit joblib pyngrok
!python -m spacy download en_core_web_sm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 

In [2]:
from google.colab import files
import os

# Location where the credentials are stored; the download cell checks this same path.
KAGGLE_DIR = '/root/.kaggle'
KAGGLE_TOKEN_PATH = os.path.join(KAGGLE_DIR, 'kaggle.json')

print('Upload kaggle.json now (from your Kaggle account -> API -> create token)')
uploaded = files.upload()
if 'kaggle.json' in uploaded:
    os.makedirs(KAGGLE_DIR, exist_ok=True)
    # Create the file with owner-only permissions from the start, so the API key is
    # never on disk with a wider mode (writing first and chmod-ing afterwards leaves a gap).
    fd = os.open(KAGGLE_TOKEN_PATH, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, 'wb') as f:
        f.write(uploaded['kaggle.json'])
    # The mode passed to os.open only applies to newly created files; tighten a
    # pre-existing file as well.
    os.chmod(KAGGLE_TOKEN_PATH, 0o600)
    print('kaggle.json uploaded')
else:
    print('No kaggle.json uploaded. Continue if dataset already present.')

Upload kaggle.json now (from your Kaggle account -> API -> create token)


Saving kaggle.json to kaggle.json
kaggle.json uploaded


In [3]:
import os
if os.path.exists('/root/.kaggle/kaggle.json'):
    !kaggle datasets download -d tmdb/tmdb-movie-metadata -p /content --unzip
else:
    print('kaggle.json not found; upload dataset manually to /content')

Dataset URL: https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata
License(s): other
Downloading tmdb-movie-metadata.zip to /content
  0% 0.00/8.89M [00:00<?, ?B/s]
100% 8.89M/8.89M [00:00<00:00, 865MB/s]


In [4]:
import pandas as pd, numpy as np, re, spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

# Load dataset: use the first known file name that exists in the working directory.
candidates = ['tmdb_5000_movies.csv','movies.csv','movies_metadata.csv']
for c in candidates:
    if not os.path.exists(c):
        continue
    df = pd.read_csv(c)
    break
else:
    # Reached only when the loop finished without hitting `break`.
    raise FileNotFoundError('Dataset not found. Upload tmdb_5000_movies.csv')

# Standardize: the text column must be called 'overview'; accept 'description' as an alias.
if 'overview' not in df.columns:
    if 'description' not in df.columns:
        raise ValueError('No overview column found')
    df = df.rename(columns={'description':'overview'})

# Keep only the two columns used downstream and replace missing overviews with ''.
df = df[['title','overview']].assign(
    overview=lambda frame: frame['overview'].fillna('')
)

# spaCy pipeline with the parser and NER components disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser','ner'])
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalize raw text to lowercase ASCII letters/digits separated by single spaces.

    Steps: lowercase, fold accented characters to their base letter, replace
    ``<...>`` tags with a space, replace every remaining character outside
    ``[a-z0-9]`` and whitespace with a space, then collapse runs of whitespace.

    Args:
        text: Value to clean. Anything that is not a ``str`` yields ``''``.

    Returns:
        The cleaned string (possibly empty).
    """
    import unicodedata  # local import keeps this function self-contained

    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Decompose accented characters and drop the combining marks so that e.g.
    # "amélie" becomes "amelie". Without this, the ASCII-only filter below would
    # replace the accented letter with a space and split the word ("am lie").
    decomposed = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

def preprocess_texts(texts):
    """Lemmatize texts with the module-level spaCy pipeline ``nlp``.

    For every document, keeps the lowercased lemma of each token that is
    alphabetic, is not a stop word, and whose lemma is longer than one character.

    Args:
        texts: Iterable of strings handed to ``nlp.pipe`` (batch size 50).

    Returns:
        A ``(cleaned, tokenized)`` tuple. ``tokenized`` holds one list of lemmas
        per document; ``cleaned`` holds the same lemmas joined by single spaces.
    """
    def _keep(tok):
        # Filter applied to every token before its lemma is collected.
        return tok.is_alpha and not tok.is_stop and len(tok.lemma_) > 1

    tokenized = [
        [tok.lemma_.lower() for tok in doc if _keep(tok)]
        for doc in nlp.pipe(texts, batch_size=50)
    ]
    cleaned = [' '.join(lemmas) for lemmas in tokenized]
    return cleaned, tokenized

print('Preprocessing...')
# Two passes over the overviews: regex normalization, then spaCy lemmatization.
df['overview_clean'] = df['overview'].astype(str).map(clean_text)
cleaned_texts, tokenized_texts = preprocess_texts(list(df['overview_clean']))
df['overview_proc'] = cleaned_texts

# TF-IDF
print('TF-IDF...')
# Unigrams and bigrams, vocabulary capped at 8000 features.
tfidf = TfidfVectorizer(max_features=8000, ngram_range=(1,2))
tfidf_matrix = tfidf.fit_transform(list(df['overview_proc']))
# With a single argument, cosine_similarity compares the rows of the matrix with each other.
tfidf_sim = cosine_similarity(tfidf_matrix)

# Word2Vec
print('Word2Vec...')
# Passing `sentences` to the constructor builds the vocabulary AND trains the model.
# The epoch count is therefore set here; calling `train()` again afterwards would start
# a second, separate training run on top of the constructor's default one.
w2v = Word2Vec(
    sentences=tokenized_texts,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    epochs=30,
)
import numpy as np
def avg_w2v(tok_texts, model):
    """Build one vector per document by averaging its in-vocabulary word vectors.

    Args:
        tok_texts: Sequence of token lists, one per document.
        model: Object exposing ``vector_size`` and ``wv``, where ``wv`` supports
            ``token in wv`` and ``wv[token]``.

    Returns:
        Array of shape ``(len(tok_texts), model.vector_size)``. Rows for documents
        with no token found in ``model.wv`` stay all zeros.
    """
    doc_vectors = np.zeros((len(tok_texts), model.vector_size))
    keyed = model.wv
    for row, tokens in enumerate(tok_texts):
        known = [keyed[tok] for tok in tokens if tok in keyed]
        if not known:
            continue  # leave the zero row in place
        doc_vectors[row] = np.mean(known, axis=0)
    return doc_vectors

# Averaged Word2Vec document vectors and their pairwise cosine similarity.
avg_vecs = avg_w2v(tokenized_texts, w2v)
w2v_sim = cosine_similarity(avg_vecs)

# Sentence-BERT
print('Sentence-BERT...')
sent_model = SentenceTransformer('all-MiniLM-L6-v2')
# NOTE(review): the encoder receives the lemmatized 'overview_proc' text, not the raw
# overview — confirm that this is intended.
sent_emb = sent_model.encode(
    list(df['overview_proc']),
    show_progress_bar=True,
    convert_to_numpy=True,
)
sent_sim = cosine_similarity(sent_emb)

# simple recommend
# Lookup from movie title to its df index label; recommend() uses the value as a row
# position in the similarity matrices. A dict holds one entry per distinct title.
indices = pd.Series(df.index, index=df['title']).to_dict()

def recommend(title, top_k=10, weights=(0.4,0.2,0.4)):
    """Return the movies most similar to ``title`` under a weighted similarity blend.

    The row of ``title`` is taken from each of the module-level similarity
    matrices (``tfidf_sim``, ``w2v_sim``, ``sent_sim``) and the three rows are
    combined with ``weights`` after normalizing the weights to sum to 1.

    Args:
        title: Movie title; must be a key of the module-level ``indices`` dict.
        top_k: Number of results to return (applied as a slice on the ranking).
        weights: Weights for the TF-IDF, Word2Vec and Sentence-BERT similarities,
            in that order.

    Returns:
        DataFrame with the ``title`` and ``overview`` columns of the selected
        rows of ``df``, ordered from most to least similar. The queried movie
        itself is excluded.

    Raises:
        ValueError: If ``title`` is unknown, or if the weights are not finite
            or sum to zero (normalizing them would give NaN scores).
    """
    if title not in indices:
        raise ValueError('title not found')
    idx = indices[title]

    w = np.asarray(weights, dtype=float)
    total = w.sum()
    if not np.isfinite(total) or total == 0:
        raise ValueError('weights must be finite and must not sum to zero')
    w = w / total

    combined = w[0]*tfidf_sim[idx] + w[1]*w2v_sim[idx] + w[2]*sent_sim[idx]
    # Stable argsort on the negated scores gives descending order while keeping
    # tied movies in their original row order.
    order = np.argsort(-combined, kind='stable')
    order = order[order != idx][:top_k]
    return df.iloc[order][['title','overview']]

# Smoke test: show five recommendations for the title stored at row position 10.
print('Ready. Example:')
example_title = df['title'].iloc[10]
print(recommend(example_title, top_k=5))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Preprocessing...
TF-IDF...
Word2Vec...




Sentence-BERT...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/151 [00:00<?, ?it/s]

Ready. Example:
                                        title  \
870                               Superman II   
2433         Superman IV: The Quest for Peace   
813                                  Superman   
1296                             Superman III   
3854  Batman: The Dark Knight Returns, Part 2   

                                               overview  
870   Three escaped criminals from the planet Krypto...  
2433  With global superpowers engaged in an increasi...  
813   Mild-mannered Clark Kent works as a reporter a...  
1296  Aiming to defeat the Man of Steel, wealthy exe...  
3854  Batman has stopped the reign of terror that Th...  


In [5]:
%%writefile streamlit_app.py
import streamlit as st
import pandas as pd
import joblib
import os

# Input locations: DATA_PATH is the CSV read by load_data(); MODEL_DIR is the
# directory scanned by load_models().
DATA_PATH = "tmdb_5000_movies.csv"
MODEL_DIR = "models"

st.title("🎬 Movie Recommendation System (NLP-based)")

# Load dataset
@st.cache_data
def load_data():
    """Read the CSV at ``DATA_PATH`` into a DataFrame; wrapped in ``st.cache_data``."""
    return pd.read_csv(DATA_PATH)

movies_df = load_data()

# Load models
@st.cache_resource
def load_models():
    """Load whichever pickled models exist under ``MODEL_DIR``.

    Looks for ``tfidf_model.pkl``, ``word2vec_model.pkl`` and ``sbert_model.pkl``.
    Missing files are skipped, so the returned dict may be empty.

    Decorated with ``st.cache_resource`` because Streamlit re-executes the whole
    script on every widget interaction; without caching each click would read
    the pickles from disk again.

    Returns:
        dict mapping model name ("tfidf", "word2vec", "sbert") to the loaded object.
    """
    models = {}
    for name in ["tfidf", "word2vec", "sbert"]:
        path = os.path.join(MODEL_DIR, f"{name}_model.pkl")
        if os.path.exists(path):
            # SECURITY: joblib.load unpickles the file, which can execute arbitrary
            # code. Only place model files you created yourself in MODEL_DIR.
            models[name] = joblib.load(path)
    return models

models = load_models()

# Recommendation function (simple placeholder)
def recommend(movie_title, top_n=5):
    """Placeholder recommender: randomly sample up to ``top_n`` other movies.

    Args:
        movie_title: Title selected by the user; looked up in ``movies_df['title']``.
        top_n: Maximum number of movies to return.

    Returns:
        DataFrame with ``title`` and ``overview`` columns. It is empty when the
        title is unknown, so both branches return the same type and callers can
        use ``len()`` and ``.iterrows()`` either way. The selected movie is never
        part of the result.
    """
    if movie_title not in movies_df['title'].values:
        return movies_df.iloc[0:0][['title', 'overview']]
    # TODO: Replace with ensemble method
    others = movies_df.loc[movies_df['title'] != movie_title, ['title', 'overview']]
    # Clamp the sample size: DataFrame.sample raises when asked for more rows than exist.
    sample_size = max(0, min(top_n, len(others)))
    return others.sample(sample_size)

# Streamlit UI
st.write("Select a movie and get recommendations:")

# The dropdown offers only the first 500 titles of the dataset.
title_options = movies_df['title'].values[:500]
selected_movie = st.selectbox("Choose a movie:", title_options)

if st.button("Recommend"):
    recs = recommend(selected_movie)
    if len(recs) == 0:
        st.warning("No recommendations found.")
    else:
        st.write("### Recommended Movies:")
        # One entry per recommended movie: bold title, overview, then a separator.
        for _, movie in recs.iterrows():
            st.write(f"🎥 **{movie['title']}**")
            st.write(movie['overview'])
            st.write("---")


Writing streamlit_app.py


In [6]:
!pip install streamlit pyngrok



In [7]:
import os
from getpass import getpass

from pyngrok import ngrok

# Never store the ngrok authtoken in the notebook: read it from the NGROK_AUTH_TOKEN
# environment variable, or prompt for it without echoing the input.
# Any token that was previously pasted into this cell is exposed wherever the notebook
# was shared and should be revoked in the ngrok dashboard.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok authtoken: ")

# Register the token through pyngrok's API instead of interpolating the secret into a
# shell command line.
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [8]:
# List the current working directory (sanity check before launching the app).
!ls

kaggle.json  streamlit_app.py	    tmdb_5000_movies.csv
sample_data  tmdb_5000_credits.csv


In [9]:
# Start the Streamlit server for streamlit_app.py on port 8501 as a background job (`&`);
# `nohup` appends the server's output to nohup.out.
!nohup streamlit run streamlit_app.py --server.port 8501 &

nohup: appending output to 'nohup.out'


In [10]:
# Open an ngrok tunnel to local port 8501 (the port given to `streamlit run`) and
# print the tunnel object, which shows the public URL.
public_url = ngrok.connect(8501)
print("Your Streamlit app is live at:", public_url)

Your Streamlit app is live at: NgrokTunnel: "https://73667bb11404.ngrok-free.app" -> "http://localhost:8501"
