In [184]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import pymongo
import datetime

In [185]:
# Load the raw show listing and reduce 'Start Date' to plain calendar dates
# (datetime.date objects) in a single chained expression.
df = pd.read_csv('TV_Shows.Shows.csv').assign(
    **{'Start Date': lambda frame: pd.to_datetime(frame['Start Date']).dt.date}
)

In [186]:
# Collapse the listing to one row per show name: minimum rank, maximum of each
# engagement counter, the first URL encountered and the earliest start date.
agg_df = (
    df
    .groupby('Show Name', as_index=False)
    .agg({
        'Rank': 'min',
        **dict.fromkeys(['Rating', 'Activity', 'Views', 'Followers'], 'max'),
        'Show URL': 'first',
        'Start Date': 'min',  # key must match the column name exactly
    })
)

In [187]:
# Preview the first rows of the per-show aggregate.
agg_df.head()

Unnamed: 0,Show Name,Rank,Rating,Activity,Views,Followers,Show URL,Start Date
0,10 Ka Dum Season 2,33,155,0,4,0,https://www.indiaforums.com/show/10-ka-dum-sea...,2018-05-28
1,12/24 Karol Bagh,6,210,34,124,26,https://www.indiaforums.com/show/1224-karol-ba...,2010-03-01
2,2025 Jaane Kya Hoga Aagey,45,160,4,2,5,https://www.indiaforums.com/show/2025-jaane-ky...,2015-09-07
3,21 Sarfarosh: Saragarhi 1897,68,0,0,0,1,https://www.indiaforums.com/show/21-sarfarosh-...,2018-02-26
4,24 Season 2,25,158,4,7,11,https://www.indiaforums.com/show/24-season-2_5699,2016-08-08


In [188]:

# Cast-page URL (or None) for each show, in agg_df row order; populated
# further down in this cell.
cast_page_urls = []


def get_cast_page_url(show_url, timeout=30):
    """Return the absolute URL of a show's cast page, or None.

    Downloads ``show_url``, takes the first ``<a title="CAST">`` element and
    resolves its ``href`` against the site root.

    Args:
        show_url: URL of the show's landing page.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the whole scraping loop indefinitely.

    Returns:
        The cast page URL as a string, or None when the request fails or the
        page has no CAST link with an ``href``.
    """
    from urllib.parse import urljoin  # local import keeps the function self-contained

    try:
        response = requests.get(show_url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes
        soup = BeautifulSoup(response.content, 'html.parser')
        cast_link = soup.find('a', {'title': 'CAST'})
        # .get() avoids a KeyError on an anchor that carries no href attribute.
        href = cast_link.get('href') if cast_link is not None else None
        if href:
            # urljoin handles root-relative, relative and already-absolute
            # hrefs; plain concatenation only worked for the first kind.
            return urljoin('https://www.indiaforums.com', href)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {show_url}: {e}")
    return None


# Look up the cast-page URL for every show, in agg_df row order. Each lookup
# yields exactly one entry (a URL or None), so the list always has one element
# per row and needs no padding. Rebuilding the list here (instead of appending
# to an existing one) keeps this step safe to re-run.
cast_page_urls = [get_cast_page_url(show_url) for show_url in agg_df['Show URL']]


agg_df['Cast Page URL'] = cast_page_urls


def scrape_cast(cast_page_url, timeout=30):
    """Scrape a cast page into a single "Actor as Character" string.

    Args:
        cast_page_url: URL of the cast page. Anything that is not a string
            (None, or NaN from a frame with missing values) is treated as
            "no cast page".
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A comma-separated string of ``"<name> as <character>"`` entries (empty
        when the page lists no complete entries), or None when there is no
        URL or the request fails.
    """
    if not isinstance(cast_page_url, str):
        return None
    try:
        response = requests.get(cast_page_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        cast_list = []
        for item in soup.find_all('div', class_='show-cast__item'):
            name_tag = item.find('a', class_='show-cast__title')
            # The "played as" paragraph may be absent; chaining .find() on the
            # missing element would raise AttributeError, so look it up in
            # two steps and skip incomplete entries.
            played_as = item.find('p', class_='show-cast__played-as')
            character_tag = played_as.find('a') if played_as is not None else None
            if name_tag is not None and character_tag is not None:
                name = name_tag.text.strip()
                character = character_tag.text.strip()
                cast_list.append(f"{name} as {character}")
        return ', '.join(cast_list)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching cast page URL {cast_page_url}: {e}")
    return None


# Scrape the cast listing behind each cast-page URL and store it per show.
agg_df['Cast'] = agg_df['Cast Page URL'].map(scrape_cast)


# Preview the frame with the new cast columns.
agg_df.head(n=5)

Error fetching URL https://www.indiaforums.com/show/bhagyavidhaata_4310: 524 Server Error:  for url: https://www.indiaforums.com/show/bhagyavidhaata_4310
Error fetching URL https://www.indiaforums.com/show/dil-hai-hindustani_5796: 520 Server Error:  for url: https://www.indiaforums.com/show/dil-hai-hindustani_5796


Unnamed: 0,Show Name,Rank,Rating,Activity,Views,Followers,Show URL,Start Date,Cast Page URL,Cast
0,10 Ka Dum Season 2,33,155,0,4,0,https://www.indiaforums.com/show/10-ka-dum-sea...,2018-05-28,https://www.indiaforums.com/show/10-ka-dum-sea...,Salman Khan as Host
1,12/24 Karol Bagh,6,210,34,124,26,https://www.indiaforums.com/show/1224-karol-ba...,2010-03-01,https://www.indiaforums.com/show/1224-karol-ba...,"Neil Bhatt as Abhinav Tarneja, Manit Joura as ..."
2,2025 Jaane Kya Hoga Aagey,45,160,4,2,5,https://www.indiaforums.com/show/2025-jaane-ky...,2015-09-07,https://www.indiaforums.com/show/2025-jaane-ky...,"Arvind Vaidya as Mr.Patel, Lubna Salim as Gang..."
3,21 Sarfarosh: Saragarhi 1897,68,0,0,0,1,https://www.indiaforums.com/show/21-sarfarosh-...,2018-02-26,https://www.indiaforums.com/show/21-sarfarosh-...,
4,24 Season 2,25,158,4,7,11,https://www.indiaforums.com/show/24-season-2_5699,2016-08-08,https://www.indiaforums.com/show/24-season-2_5...,Anil Kapoor as Jay Singh Rathore


In [189]:
# Summarise column dtypes and non-null counts to see how many lookups came back empty.
agg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 997 entries, 0 to 996
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Show Name      997 non-null    object
 1   Rank           997 non-null    int64 
 2   Rating         997 non-null    int64 
 3   Activity       997 non-null    int64 
 4   Views          997 non-null    int64 
 5   Followers      997 non-null    int64 
 6   Show URL       997 non-null    object
 7   Start Date     997 non-null    object
 8   Cast Page URL  995 non-null    object
 9   Cast           995 non-null    object
dtypes: int64(5), object(5)
memory usage: 78.0+ KB


In [191]:

# Story-page URL (or None) for each show, in agg_df row order; populated
# further down in this cell.
story_page_urls = []


def get_story_page_url(show_url, timeout=30):
    """Return the absolute URL of a show's story page, or None.

    Downloads ``show_url``, takes the first ``<a title="STORY">`` element and
    resolves its ``href`` against the site root.

    Args:
        show_url: URL of the show's landing page.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the whole scraping loop indefinitely.

    Returns:
        The story page URL as a string, or None when the request fails or the
        page has no STORY link with an ``href``.
    """
    from urllib.parse import urljoin  # local import keeps the function self-contained

    try:
        response = requests.get(show_url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes
        soup = BeautifulSoup(response.content, 'html.parser')
        story_link = soup.find('a', {'title': 'STORY'})
        # .get() avoids a KeyError on an anchor that carries no href attribute.
        href = story_link.get('href') if story_link is not None else None
        if href:
            # urljoin handles root-relative, relative and already-absolute
            # hrefs; plain concatenation only worked for the first kind.
            return urljoin('https://www.indiaforums.com', href)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {show_url}: {e}")
    return None


# Look up the story-page URL for every show, in agg_df row order. Each lookup
# yields exactly one entry (a URL or None), so the list always has one element
# per row and needs no padding. Rebuilding the list here (instead of appending
# to an existing one) keeps this step safe to re-run.
story_page_urls = [get_story_page_url(show_url) for show_url in agg_df['Show URL']]


agg_df['Story Page URL'] = story_page_urls


def scrape_story(story_page_url, timeout=30):
    """Scrape the story/synopsis text from a show's story page.

    Args:
        story_page_url: URL of the story page. Anything that is not a string
            (None, or NaN from a frame with missing values) is treated as
            "no story page".
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of every ``<p>`` inside the main content element, joined with
        single spaces, or None when there is no URL, the request fails, or the
        content element is not found.
    """
    if not isinstance(story_page_url, str):
        return None
    try:
        response = requests.get(story_page_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # A CSS selector matches a <main> carrying both classes regardless of
        # their order or of extra classes; class_='a b' only matches when the
        # class attribute is exactly that string.
        story_content = soup.select_one('main.layout-main.movie-reviews__content')
        if story_content is not None:
            paragraphs = story_content.find_all('p')
            return ' '.join(p.get_text(strip=True) for p in paragraphs)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching story page URL {story_page_url}: {e}")
    return None


# Scrape the story text behind each story-page URL and store it per show.
agg_df['Story'] = agg_df['Story Page URL'].map(scrape_story)


# Preview the frame with the new story columns.
agg_df.head(n=5)

Unnamed: 0,Show Name,Rank,Rating,Activity,Views,Followers,Show URL,Start Date,Cast Page URL,Cast,Story Page URL,Story
0,10 Ka Dum Season 2,33,155,0,4,0,https://www.indiaforums.com/show/10-ka-dum-sea...,2018-05-28,https://www.indiaforums.com/show/10-ka-dum-sea...,Salman Khan as Host,https://www.indiaforums.com/show/10-ka-dum-sea...,10 Ka Dum Season 2
1,12/24 Karol Bagh,6,210,34,124,26,https://www.indiaforums.com/show/1224-karol-ba...,2010-03-01,https://www.indiaforums.com/show/1224-karol-ba...,"Neil Bhatt as Abhinav Tarneja, Manit Joura as ...",https://www.indiaforums.com/show/1224-karol-ba...,The show will see the sights of hubbub lanes a...
2,2025 Jaane Kya Hoga Aagey,45,160,4,2,5,https://www.indiaforums.com/show/2025-jaane-ky...,2015-09-07,https://www.indiaforums.com/show/2025-jaane-ky...,"Arvind Vaidya as Mr.Patel, Lubna Salim as Gang...",https://www.indiaforums.com/show/2025-jaane-ky...,Jaane Kya Hoga is the story of Joshi family wh...
3,21 Sarfarosh: Saragarhi 1897,68,0,0,0,1,https://www.indiaforums.com/show/21-sarfarosh-...,2018-02-26,https://www.indiaforums.com/show/21-sarfarosh-...,,https://www.indiaforums.com/show/21-sarfarosh-...,21 Sarfarosh: Saragarhi 1897
4,24 Season 2,25,158,4,7,11,https://www.indiaforums.com/show/24-season-2_5699,2016-08-08,https://www.indiaforums.com/show/24-season-2_5...,Anil Kapoor as Jay Singh Rathore,https://www.indiaforums.com/show/24-season-2_5...,"24 India Season 2, is the sequel to the hit 20..."


In [None]:
#agg_df.head()

In [193]:

# Connect to the MongoDB server on this machine.
client = pymongo.MongoClient("mongodb://localhost:27017")

# Convert DataFrame to a list of dictionaries (one per show)
data_cast = agg_df.to_dict(orient="records")

MAX_STRING_LENGTH = 1024  # Maximum stored length of the 'Cast' text (adjust as needed)

for item in data_cast:
    # Truncate over-long cast listings. Only real strings are measured: a
    # missing cast may be None or NaN, and len(NaN) would raise TypeError.
    cast = item['Cast']
    if isinstance(cast, str) and len(cast) > MAX_STRING_LENGTH:
        item['Cast'] = cast[:MAX_STRING_LENGTH]

    # Convert date values to 'YYYY-MM-DD' text; they are stored as plain
    # strings, not as BSON dates. datetime.datetime values also pass this
    # check and are reduced to their date part. Re-assigning existing keys
    # while iterating is safe because the dict's size does not change.
    for key, value in item.items():
        if isinstance(value, datetime.date):
            item[key] = value.strftime('%Y-%m-%d')


# Target database; the documents go into its "Shows" collection.
db_cast = client["cast"]


def insert_in_chunks_cast(data, chunk_size=1000):
    """Insert ``data`` into ``db_cast.Shows`` in batches of ``chunk_size``.

    Each batch is sent with ``ordered=False``. A batch that raises is
    reported on stdout and the remaining batches are still attempted, so one
    bad batch does not abort the whole load.

    Args:
        data: Sequence of documents (dicts) to insert.
        chunk_size: Maximum number of documents per ``insert_many`` call;
            must be at least 1.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. A negative size used
            to make the loop run zero times, silently inserting nothing.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")

    # Loop through the data in chunks
    for i in range(0, len(data), chunk_size):
        chunk = data[i:i + chunk_size]
        try:
            db_cast.Shows.insert_many(chunk, ordered=False)
        except Exception as e:
            # Deliberately best-effort: report the failed batch and carry on.
            print(f"Error inserting chunk starting at index {i}: {e}")


# Send every prepared record to MongoDB. Batch failures are caught and printed
# inside the helper, so the message below is shown whether or not every batch
# was stored.
insert_in_chunks_cast(data=data_cast)
print("Data insertion into the 'cast' database completed.")


Data insertion into the 'cast' database completed.
