In [11]:
# Install required packages
!pip install pyngrok streamlit scikit-learn pandas numpy nltk -q

# First, let's upload your movie file
from google.colab import files
uploaded = files.upload()

# Verify the file was uploaded
import os
if not any('movie' in fn.lower() for fn in uploaded.keys()):
    raise FileNotFoundError("Please upload your movie data file when prompted")

# Set up ngrok authtoken
from pyngrok import ngrok, conf
conf.get_default().auth_token = "2vtgEOXNSRCbgXSwkdP3VKBCsER_5rbnuwvpZh3Pai2iKfL4b"

# Kill any existing ngrok and Streamlit processes
!pkill ngrok || true
!pkill -f "streamlit run" || true

# Write the complete Streamlit app to a file
with open('app.py', 'w') as f:
    f.write('''
import streamlit as st
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem import PorterStemmer
import re

# Download all required NLTK data
# Recent NLTK releases (3.9+) load the tokenizer models from 'punkt_tab'
# instead of 'punkt', so both are requested. With the default download
# settings a package the installed release does not offer is reported as a
# failed download rather than raising.
for resource_path, package in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package, quiet=True)

# Initialize stemmer
stemmer = PorterStemmer()

# Preprocessing function
def preprocess_text(text):
    """Normalize a free-text field into a space-separated string of stemmed tokens.

    Args:
        text: Raw cell value; anything that is not missing is converted with str().

    Returns:
        The lowercased, stemmed tokens joined by single spaces, or an empty
        string when the value is missing according to pd.isna.
    """
    if pd.isna(text):
        return ""
    # Convert to lowercase
    text = str(text).lower()
    # Join hyphenated names ("sci-fi" -> "scifi"), then turn every other run of
    # non-alphanumeric characters into a single space. Deleting separators
    # outright would glue pipe-separated values such as "Action|Comedy" into
    # one token, so only identical genre combinations could ever match.
    text = text.replace('-', '')
    text = re.sub('[^a-z0-9]+', ' ', text)
    # Tokenize and stem
    try:
        tokens = nltk.word_tokenize(text)
    except LookupError:
        # Tokenizer data is missing: fetch it (both the legacy and the current
        # package name) and retry once. Any other error is left to propagate.
        nltk.download('punkt', quiet=True)
        nltk.download('punkt_tab', quiet=True)
        tokens = nltk.word_tokenize(text)
    return ' '.join(stemmer.stem(token) for token in tokens)

# Load and preprocess the dataset
@st.cache_data
def load_data():
    """Load the first usable movie file from the working directory.

    Every regular file in the current directory whose name contains "movie"
    (case-insensitive) is tried, most recently modified first. A file is usable
    when it parses as tab- or comma-separated text and has both a column whose
    name contains "genre" and a different column whose name contains "title".

    Returns:
        A DataFrame with standardized "title" and "genres" columns, a
        "processed_genres" column built with preprocess_text, and a fresh
        0..n-1 index; None (after showing an error in the app) when no usable
        file is found.
    """
    # Find the uploaded file
    import os
    movie_files = sorted(
        (f for f in os.listdir() if 'movie' in f.lower() and os.path.isfile(f)),
        key=os.path.getmtime,
        reverse=True,
    )
    if not movie_files:
        st.error("No movie file found!")
        return None

    # Try reading with different formats
    for file in movie_files:
        try:
            # First try tab-separated
            df = pd.read_csv(file, sep='\t')
            if len(df.columns) == 1:
                # A single column means the tab split did not apply; retry as comma-separated
                df = pd.read_csv(file, sep=',')
        except Exception:
            # Unreadable or unparsable file: move on to the next candidate
            continue

        # Find the genres and title columns (case-insensitive)
        genre_cols = [col for col in df.columns if 'genre' in col.lower()]
        if not genre_cols:
            continue
        title_cols = [
            col for col in df.columns
            if 'title' in col.lower() and col != genre_cols[0]
        ]
        if not title_cols:
            # Without a title column the result cannot be searched or displayed
            continue

        # Standardize column names
        df = df.rename(columns={genre_cols[0]: 'genres', title_cols[0]: 'title'})

        # Drop rows with missing genres, then renumber the rows so that index
        # labels equal row positions for anything built positionally from this frame
        df = df.dropna(subset=['genres']).reset_index(drop=True)
        df['processed_genres'] = df['genres'].apply(preprocess_text)
        return df

    st.error("Couldn't find valid title and genres columns in any file")
    return None

def get_recommendations(title, df, tfidf_matrix, top_n=5):
    """Return the rows most similar to the first movie whose title matches.

    Args:
        title: Text to look for inside df['title'] (case-insensitive, literal
            substring). The first matching row is used as the query.
        df: DataFrame with "title" and "genres" columns whose row order matches
            the rows of tfidf_matrix.
        tfidf_matrix: Feature matrix with one row per row of df.
        top_n: Maximum number of recommendations to return.

    Returns:
        A DataFrame holding the "title" and "genres" columns of the most
        similar rows, best first, or None when no title matches.
    """
    # regex=False: user input such as "Heat (" must not be parsed as a pattern.
    # na=False: rows without a title count as non-matches instead of putting
    # NaN into the boolean mask.
    mask = df['title'].str.contains(title, case=False, regex=False, na=False)
    hit_positions = mask.to_numpy().nonzero()[0]
    if hit_positions.size == 0:
        return None

    # Use the row position, not the index label: the matrix is positional, and
    # labels differ from positions whenever df's index is not 0..n-1.
    query_pos = int(hit_positions[0])
    scores = cosine_similarity(tfidf_matrix[query_pos:query_pos + 1], tfidf_matrix)[0]

    # Rank every other row. The query is excluded by position because rows with
    # identical features tie with it at the top, so it is not guaranteed to
    # sort first.
    ranked = sorted(
        (pos for pos in range(len(scores)) if pos != query_pos),
        key=lambda pos: scores[pos],
        reverse=True,
    )[:top_n]
    return df.iloc[ranked][['title', 'genres']]

# Streamlit UI
st.set_page_config(layout="wide")
st.title("🎬 Movie Recommender")

# Load data; the rest of the page is only drawn when a dataset is available
df = load_data()
if df is not None:
    # Vectorize the preprocessed genre strings, ignoring English stop words
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['processed_genres'])

    query = st.text_input("Movie title:", "Toy Story")
    recommend_clicked = st.button("Recommend")
    if recommend_clicked:
        with st.spinner('Finding similar movies...'):
            recs = get_recommendations(query, df, tfidf_matrix)
        if recs is not None:
            st.write("### Recommendations:")
            # One title line, one genre line and a separator per recommendation
            for _, row in recs.iterrows():
                st.write(f"**{row['title']}**")
                st.write(f"*{row['genres'].replace('|', ', ')}*")
                st.write("---")
        else:
            st.error("Movie not found")

    show_raw = st.checkbox("Show raw data")
    if show_raw:
        # Preview the first ten rows of the two displayed columns
        st.dataframe(df[['title', 'genres']].head(10))
''')

# Run the app
from pyngrok import ngrok
import subprocess
import threading
import time

def run_streamlit():
    """Run app.py with Streamlit on port 8502 in headless mode; blocks until the process exits."""
    command = ['streamlit', 'run', 'app.py', '--server.port', '8502', '--server.headless', 'true']
    subprocess.run(command)

# Start Streamlit in a background thread so this cell can go on to open the tunnel
threading.Thread(target=run_streamlit, daemon=True).start()
time.sleep(5)  # Wait for Streamlit to start

public_url = ngrok.connect(8502).public_url
print(f"🌐 App is running at: {public_url}")

# Keep alive until the cell is interrupted, then always shut ngrok down
try:
    while True:
        time.sleep(10)
except KeyboardInterrupt:
    # Interrupting the cell is the normal way out; nothing to report
    pass
finally:
    # Runs for any exit path, so an unexpected error no longer gets swallowed
    # and the tunnel is still closed
    ngrok.kill()

Saving movie.txt to movie (9).txt
^C
🌐 App is running at: https://ccfa-34-135-19-160.ngrok-free.app
