In [1]:
import io
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

print("Starting World Bank data fetch...")

# World Bank series to download, as (display label, API indicator code) pairs.
# The label is reused as the value-column name when the series are merged.
indicators = dict(
    [
        ("GDP (current US$)", "NY.GDP.MKTP.CD"),
        ("Population", "SP.POP.TOTL"),
        ("Literacy Rate (%)", "SE.ADT.LITR.ZS"),
        ("CO2 Emissions (kt)", "EN.ATM.CO2E.KT"),
    ]
)

# Function to fetch World Bank data
def fetch_indicator(indicator_code, timeout=30):
    """Download one World Bank indicator and return its data CSV as a DataFrame.

    The API answers with a zip archive holding several CSV files; the largest
    one is taken to be the data file (the others are metadata/footnotes).

    Args:
        indicator_code: World Bank indicator code, e.g. "SP.POP.TOTL".
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The parsed DataFrame, or None when the download fails, the response
        is not a zip archive, or the archive contains no CSV file. A short
        message is printed in each failure case.
    """
    url = f"https://api.worldbank.org/v2/en/indicator/{indicator_code}?downloadformat=csv"
    try:
        # Without a timeout a stalled connection would hang the cell forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch {indicator_code}: {exc}")
        return None
    if response.status_code != 200:
        print(f"Failed to fetch {indicator_code}")
        return None

    try:
        with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
            # pick the biggest CSV (data file), skip metadata/footnotes
            csv_names = [entry for entry in archive.namelist() if entry.endswith(".csv")]
            if not csv_names:
                print(f"Failed to fetch {indicator_code}: archive contains no CSV file")
                return None
            data_name = max(csv_names, key=lambda entry: archive.getinfo(entry).file_size)

            # The first four rows are skipped before the header row
            # (presumably a fixed preamble in the export; verify if the format changes).
            with archive.open(data_name) as handle:
                return pd.read_csv(handle, skiprows=4)
    except zipfile.BadZipFile:
        print(f"Failed to fetch {indicator_code}: response is not a zip archive")
        return None

# Fetch all indicators
# Each indicator arrives as a wide table (one row per country, one column per
# year). It is reshaped to long form (Year, Country, <indicator>) and
# outer-merged onto the running result so no country/year pair is lost.
merged_df = None
for name, code in indicators.items():
    wide_df = fetch_indicator(code)
    if wide_df is None:
        # Make the gap visible instead of silently producing a narrower dataset.
        print(f"Skipping '{name}': no data returned")
        continue

    long_df = (
        # Keep only country, indicator, and yearly values
        wide_df.drop(columns=["Country Code", "Indicator Name", "Indicator Code"], errors="ignore")
        .set_index("Country Name")
        .loc[:, lambda frame: frame.columns.str.isnumeric()]  # keep only years
        .transpose()  # years as index
        .rename_axis("Year")
        .reset_index()
        .melt(id_vars="Year", var_name="Country", value_name=name)
    )

    if merged_df is None:
        merged_df = long_df
    else:
        merged_df = pd.merge(merged_df, long_df, on=["Year", "Country"], how="outer")

# Fail here with a clear message rather than later with a TypeError on None.
if merged_df is None:
    raise RuntimeError("No World Bank indicator could be fetched; nothing to merge.")

print("Data fetched and merged.")

# Clean missing values
print("\n--- Data Cleaning ---")
# Only touch indicator columns that were actually fetched; the fetch loop may
# have skipped some, and indexing a missing column would raise KeyError.
value_cols = [
    col
    for col in ["GDP (current US$)", "Population", "Literacy Rate (%)", "CO2 Emissions (kt)"]
    if col in merged_df.columns
]

# Keep the observed (un-imputed) numeric values around: growth rates below
# must be computed from real observations, not from imputed filler.
observed = merged_df[value_cols].apply(pd.to_numeric, errors="coerce")

# NOTE(review): gaps are filled with the median over ALL countries and years,
# which is a very coarse imputation; treat filled level values with care.
for col in value_cols:
    merged_df[col] = observed[col].fillna(observed[col].median())

print(f"Missing values after cleaning: {merged_df.isnull().sum().sum()}")

# Compute growth rates
print("\n--- Growth Rates ---")
for col in ["GDP (current US$)", "Population", "CO2 Emissions (kt)"]:
    if col not in observed.columns:
        continue
    # Percentage change between consecutive rows of the same country, using
    # observed values only. fill_method=None stops pandas from forward-filling
    # gaps, so a growth rate is NaN whenever either year is unobserved instead
    # of being an artefact of the median imputation above.
    merged_df[f"{col} Growth %"] = (
        observed[col].groupby(merged_df["Country"]).pct_change(fill_method=None) * 100
    )

print("Growth rates computed.")

# Save clean dataset
merged_df.to_csv("worldbank_data_clean.csv", index=False)
print("Clean data saved to worldbank_data_clean.csv")

# --- Visualization ---

def save_histogram(values, xlabel, title, filename, bins, color):
    """Draw a histogram of `values` and write it to `filename` as a PNG.

    Args:
        values: pandas Series to plot; missing values are dropped first.
        xlabel: Label for the x axis.
        title: Figure title.
        filename: Output image path (saved at 300 dpi).
        bins: Number of histogram bins.
        color: Bar fill colour.
    """
    plt.figure(figsize=(8,6))
    plt.hist(values.dropna(), bins=bins, color=color, edgecolor="black")
    plt.xlabel(xlabel)
    plt.ylabel("Count")
    plt.title(title)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(filename, dpi=300)
    plt.close()  # release the figure so repeated runs do not accumulate memory


# (column / x label, title, output file, bins, colour) for each histogram
histogram_specs = [
    ("GDP (current US$)", "GDP Distribution Across Countries", "gdp_distribution.png", 50, "skyblue"),
    ("Population", "Population Distribution Across Countries", "population_distribution.png", 50, "lightgreen"),
    ("Literacy Rate (%)", "Literacy Rate Distribution", "literacy_distribution.png", 30, "orange"),
    ("CO2 Emissions (kt)", "CO2 Emissions Distribution", "co2_distribution.png", 50, "red"),
]

saved_plots = []
for column, title, filename, bins, color in histogram_specs:
    # An indicator that failed to download has no column; skip its plot
    # instead of aborting the remaining ones with a KeyError.
    if column not in merged_df.columns:
        print(f"Skipping {filename}: column '{column}' is not in the dataset")
        continue
    save_histogram(merged_df[column], column, title, filename, bins, color)
    saved_plots.append(filename)

# Report only the files that were really written.
print("\nPlots saved:")
for filename in saved_plots:
    print(f"   - {filename}")
print("\nDone! Dataset and plots are ready.")


Starting World Bank data fetch...
Data fetched and merged.

--- Data Cleaning ---
Missing values after cleaning: 0

--- Growth Rates ---
Growth rates computed.
Clean data saved to worldbank_data_clean.csv

Plots saved:
   - gdp_distribution.png
   - population_distribution.png
   - literacy_distribution.png
   - co2_distribution.png

Done! Dataset and plots are ready.


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re

print("Starting IMDB scraping...")

url = "https://www.imdb.com/chart/top/"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}


# Helpers are defined in this cell so it runs on a fresh kernel.
def parse_vote_count(raw_text):
    """Turn a vote label such as '(2.9M)', '(815K)' or '1,234' into an int.

    Args:
        raw_text: Text of the vote-count element; may carry surrounding
            whitespace, non-breaking spaces, parentheses and thousands commas.

    Returns:
        The vote count as an int, or None when the text is not a number with
        an optional K/M suffix.
    """
    # Whitespace (including non-breaking spaces) must go BEFORE the
    # parentheses are stripped: str.strip("()") only removes parentheses at
    # the very ends, so a leading "\xa0" would leave "(" in place and the
    # number would never parse.
    cleaned = raw_text.replace("\xa0", " ").replace(",", "").strip().strip("()").strip()
    match = re.fullmatch(r"(\d+(?:\.\d+)?)\s*([KkMm]?)", cleaned)
    if match is None:
        return None
    multiplier = {"": 1, "K": 1_000, "M": 1_000_000}[match.group(2).upper()]
    # round() rather than int(): float products can land just below the
    # intended integer and int() would truncate them.
    return round(float(match.group(1)) * multiplier)


def parse_release_year(movie_tag):
    """Return the first plausible release year found in a movie list item.

    Args:
        movie_tag: BeautifulSoup tag for one chart entry.

    Returns:
        The year as an int, or None when no candidate span contains one.
    """
    # The 'sc-…' class names look build-generated and matched nothing in the
    # last recorded run, so a semantic-looking class is tried first.
    # NOTE(review): 'cli-title-metadata-item' is assumed to be the class of the
    # year/runtime/certificate spans; confirm against the live page markup.
    candidate_spans = []
    for css_class in ('cli-title-metadata-item', 'sc-b189961a-8', 'sc-43986a27-8'):
        candidate_spans.extend(movie_tag.find_all('span', class_=css_class))

    for span in candidate_spans:
        year_match = re.search(r'\b(1[89]\d{2}|20\d{2})\b', span.get_text())
        if year_match:
            return int(year_match.group(1))
    return None


# A timeout keeps a stalled connection from hanging the cell; raise_for_status
# surfaces a blocked/failed request instead of parsing an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

movies = soup.find_all('li', class_='ipc-metadata-list-summary-item')
if not movies:
    # Everything downstream indexes columns that only exist when at least one
    # movie was parsed, so stop here with an explicit explanation.
    raise RuntimeError("No movie entries found on the page; the markup may have changed.")

movie_data = []
for i, movie in enumerate(movies):
    try:
        title_element = movie.find('h3', class_='ipc-title__text')
        if title_element:
            # Titles are rendered as "<rank>. <title>"; drop the rank prefix.
            title = re.sub(r'^\d+\.\s*', '', title_element.get_text())
        else:
            title = "Unknown"

        rating = None
        rating_element = movie.find('span', class_='ipc-rating-star--rating')
        if rating_element:
            try:
                rating = float(rating_element.get_text().strip())
            except ValueError:
                rating = None

        votes_element = movie.find('span', class_='ipc-rating-star--voteCount')
        votes = parse_vote_count(votes_element.get_text()) if votes_element else None

        year = parse_release_year(movie)

        movie_data.append({
            'rank': i + 1,
            'title': title,
            'rating': rating,
            'year': year,
            'votes': votes
        })

    except Exception as e:
        # Best effort: one malformed entry must not abort the whole scrape.
        print(f"Error with movie {i+1}: {e}")
        continue

print(f"Scraped {len(movie_data)} movies")

# Persist the raw scrape before any cleaning so it can be inspected later.
df = pd.DataFrame(movie_data)
df.to_csv('imdb_movies.csv', index=False)
print("Data saved to imdb_movies.csv")

print(f"\nSample of scraped data:")
print(df.head())
print(f"\nData types:")
print(df.dtypes)

print(f"\n--- Data Cleaning ---")
print(f"Original dataset: {len(df)} movies")
print(f"Missing ratings: {df['rating'].isnull().sum()}")
print(f"Missing years: {df['year'].isnull().sum()}")

df_clean = df.copy()

# Columns holding only None come back as object dtype; make them numeric so
# median/fillna behave predictably.
df_clean['rating'] = pd.to_numeric(df_clean['rating'], errors='coerce')
df_clean['year'] = pd.to_numeric(df_clean['year'], errors='coerce')

if df_clean['rating'].isnull().any():
    df_clean['rating'] = df_clean['rating'].fillna(df_clean['rating'].median())

# Record which years are not real observations so later consumers of the
# cleaned data can filter them out.
year_missing = df_clean['year'].isnull()
df_clean['year_estimated'] = year_missing

if year_missing.all():
    # No year could be scraped. Downstream code needs integer years, so random
    # placeholders are substituted, but they carry NO information: every
    # year-based statistic and plot is meaningless in this case.
    print("WARNING: all years are missing; filling with random placeholder years "
          "(1940-2019). Year-based results below are NOT meaningful.")
    rng = np.random.default_rng(42)  # fixed seed: reruns give the same placeholders
    df_clean['year'] = rng.integers(1940, 2020, size=len(df_clean))
elif year_missing.any():
    df_clean['year'] = df_clean['year'].fillna(df_clean['year'].median())

# .copy() detaches the de-duplicated frame so the column assignments below do
# not write into a slice of the previous frame (SettingWithCopyWarning).
df_clean = df_clean.drop_duplicates(subset=['title']).copy()

df_clean['year'] = df_clean['year'].astype(int)
df_clean['rating'] = df_clean['rating'].astype(float)
# Missing vote counts become 0, so a 0 here means "unknown", not "no votes".
df_clean['votes'] = pd.to_numeric(df_clean['votes'], errors='coerce').fillna(0).astype(int)

print(f"After cleaning: {len(df_clean)} movies")
print(f"Missing values: {df_clean.isnull().sum().sum()}")

df = df_clean

# Summary statistics of the cleaned frame `df`, printed as plain text.
print(f"\n--- Basic Stats ---")
print(f"Average rating: {df['rating'].mean():.2f}")
print(f"Median rating: {df['rating'].median():.2f}")
print(f"Rating range: {df['rating'].min():.1f} - {df['rating'].max():.1f}")
print(f"Standard deviation: {df['rating'].std():.2f}")

# Missing vote counts were replaced by 0 during cleaning, so these figures
# are all 0 when no vote count could be scraped.
print(f"\nVotes stats:")
print(f"Average votes: {df['votes'].mean():,.0f}")
print(f"Median votes: {df['votes'].median():,.0f}")
print(f"Votes range: {df['votes'].min():,} - {df['votes'].max():,}")
print(f"Standard deviation: {df['votes'].std():,.0f}")

# Years may be fill-in values from the cleaning step rather than scraped ones.
print(f"\nOldest movie: {df['year'].min()}")
print(f"Newest movie: {df['year'].max()}")

# Decade bucket: floor the year to a multiple of 10 (e.g. 1994 -> 1990).
# This adds a 'decade' column to `df`; `decade_counts` is reused by later code.
df['decade'] = (df['year'] // 10) * 10
print(f"\nMovies by decade:")
decade_counts = df['decade'].value_counts().sort_index()
for decade, count in decade_counts.items():
    print(f"{int(decade)}s: {count} movies")

# nlargest keeps the five rows with the highest rating.
print(f"\nTop 5 highest rated:")
top_5 = df.nlargest(5, 'rating')
for _, movie in top_5.iterrows():
    print(f"{movie['title']} ({int(movie['year'])}) - {movie['rating']}")

print(f"\n--- Making plots ---")

# Every figure below is drawn, written to a PNG at 300 dpi and closed again so
# figures do not pile up in memory.

# 1. Rating histogram with the mean marked by a dashed line.
plt.figure(figsize=(8, 6))
plt.hist(df['rating'], bins=15, color='skyblue', edgecolor='black')
plt.axvline(df['rating'].mean(), color='red', linestyle='--', label=f'Mean: {df["rating"].mean():.2f}')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.title('Rating Distribution')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('rating_distribution.png', dpi=300)
plt.close()

# 2. Number of movies per decade.
plt.figure(figsize=(8, 6))
if len(decade_counts) > 0:
    # Decades sit 10 units apart on a numeric axis; the default bar width of
    # 0.8 data units would draw hairline bars, so widen them to most of a decade.
    plt.bar(decade_counts.index, decade_counts.values, width=8, color='lightcoral')
    plt.xlabel('Decade')
    plt.ylabel('Number of Movies')
    plt.title('Movies by Decade')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)
else:
    plt.text(0.5, 0.5, 'No decade data', ha='center', va='center', transform=plt.gca().transAxes)
plt.tight_layout()
plt.savefig('movies_by_decade.png', dpi=300)
plt.close()

# 3. Rating against release year.
plt.figure(figsize=(8, 6))
plt.scatter(df['year'], df['rating'], alpha=0.6, color='green', s=50)
plt.xlabel('Release Year')
plt.ylabel('Rating')
plt.title('Rating vs Release Year')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('rating_vs_year.png', dpi=300)
plt.close()

# 4. Rating spread per decade, one box per decade present in the data.
plt.figure(figsize=(8, 6))
if len(decade_counts) > 0:
    decades = sorted(df['decade'].unique())
    decade_ratings = [df[df['decade'] == d]['rating'].values for d in decades]
    plt.boxplot(decade_ratings, tick_labels=[f"{int(d)}s" for d in decades])
    plt.xlabel('Decade')
    plt.ylabel('Rating')
    plt.title('Rating by Decade')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)
else:
    plt.text(0.5, 0.5, 'No decade data', ha='center', va='center', transform=plt.gca().transAxes)
plt.tight_layout()
plt.savefig('rating_by_decade.png', dpi=300)
plt.close()

# 5. Ten highest-rated movies as horizontal bars, best at the top.
plt.figure(figsize=(8, 6))
top_10 = df.nlargest(10, 'rating')
# Long titles are cut to 25 characters so the axis labels stay readable.
short_titles = [title[:25] + '...' if len(title) > 25 else title for title in top_10['title']]
y_pos = range(len(short_titles))
plt.barh(y_pos, top_10['rating'], color='purple', alpha=0.7)
plt.yticks(y_pos, short_titles)
plt.xlabel('Rating')
plt.title('Top 10 Movies by Rating')
plt.gca().invert_yaxis()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('top10_movies.png', dpi=300)
plt.close()

# 6. Histogram of release years.
plt.figure(figsize=(8, 6))
plt.hist(df['year'], bins=15, color='orange', edgecolor='black')
plt.xlabel('Release Year')
plt.ylabel('Count')
plt.title('Movies by Release Year')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('movies_by_year.png', dpi=300)
plt.close()

print("Plots saved:")
print("   - rating_distribution.png")
print("   - movies_by_decade.png")
print("   - rating_vs_year.png")
print("   - rating_by_decade.png")
print("   - top10_movies.png")
print("   - movies_by_year.png")

# Short plain-language takeaways derived from `df` and `decade_counts`.
print(f"\n--- Some interesting stuff ---")
print(f"1. Average rating is {df['rating'].mean():.2f}, pretty good stuff")

if len(decade_counts) > 0:
    best_decade = decade_counts.idxmax()
    print(f"2. Most movies are from the {int(best_decade)}s ({decade_counts.max()} movies)")
else:
    print(f"2. Year data was messy, had to estimate some years")

print(f"3. Rating standard deviation is {df['rating'].std():.2f}, {'not much spread' if df['rating'].std() < 0.3 else 'decent spread'}")

# Pearson correlation between year and rating; it is NaN when either column
# is constant or there are too few rows, which is not the same as "no effect".
correlation = df['year'].corr(df['rating'])
if pd.isna(correlation):
    print("4. Not enough variation to relate year and rating")
elif correlation > 0.1:
    print("4. Newer movies seem to score a bit higher")
elif correlation < -0.1:
    print("4. Older movies seem to score a bit higher")
else:
    print("4. Year doesn't really affect rating much")

print(f"5. Movies span {df['year'].max() - df['year'].min()} years")

# Missing vote counts are stored as 0, so a maximum of 0 means no votes were
# scraped at all; naming a "most voted" movie then would be a false claim.
if df['votes'].max() > 0:
    most_voted = df.loc[df['votes'].idxmax()]
    print(f"6. '{most_voted['title']}' ({most_voted['year']}) got the most votes.")
else:
    print("6. No vote counts were scraped, so the most-voted movie is unknown.")

print(f"\nDone! Check out the CSV files and PNG plots")

df.to_csv('imdb_movies_clean.csv', index=False)
print("Clean data saved to 'imdb_movies_clean.csv'")

Starting IMDB scraping...
Scraped 25 movies
Data saved to imdb_movies.csv

Sample of scraped data:
   rank                     title  rating  year votes
0     1  The Shawshank Redemption     9.3  None  None
1     2             The Godfather     9.2  None  None
2     3           The Dark Knight     9.1  None  None
3     4    The Godfather: Part II     9.0  None  None
4     5              12 Angry Men     9.0  None  None

Data types:
rank        int64
title      object
rating    float64
year       object
votes      object
dtype: object

--- Data Cleaning ---
Original dataset: 25 movies
Missing ratings: 0
Missing years: 25
All years are missing, using estimated years based on movie position...
After cleaning: 25 movies
Missing values: 0

--- Basic Stats ---
Average rating: 8.82
Median rating: 8.80
Rating range: 8.6 - 9.3
Standard deviation: 0.20

Votes stats:
Average votes: 0
Median votes: 0
Votes range: 0 - 0
Standard deviation: 0

Oldest movie: 1943
Newest movie: 2018

Movies by decade:

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re

# Dark theme for every figure created after this point.
plt.style.use("dark_background")

print("Starting IMDB scraping...")

url = "https://www.imdb.com/chart/top/"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}


# Helpers are defined in this cell so it runs on a fresh kernel.
def parse_vote_count(raw_text):
    """Turn a vote label such as '(2.9M)', '(815K)' or '1,234' into an int.

    Args:
        raw_text: Text of the vote-count element; may carry surrounding
            whitespace, non-breaking spaces, parentheses and thousands commas.

    Returns:
        The vote count as an int, or None when the text is not a number with
        an optional K/M suffix.
    """
    # Whitespace (including non-breaking spaces) must go BEFORE the
    # parentheses are stripped: str.strip("()") only removes parentheses at
    # the very ends, so a leading "\xa0" would leave "(" in place and the
    # number would never parse.
    cleaned = raw_text.replace("\xa0", " ").replace(",", "").strip().strip("()").strip()
    match = re.fullmatch(r"(\d+(?:\.\d+)?)\s*([KkMm]?)", cleaned)
    if match is None:
        return None
    multiplier = {"": 1, "K": 1_000, "M": 1_000_000}[match.group(2).upper()]
    # round() rather than int(): float products can land just below the
    # intended integer and int() would truncate them.
    return round(float(match.group(1)) * multiplier)


def parse_release_year(movie_tag):
    """Return the first plausible release year found in a movie list item.

    Args:
        movie_tag: BeautifulSoup tag for one chart entry.

    Returns:
        The year as an int, or None when no candidate span contains one.
    """
    # The 'sc-…' class names look build-generated and matched nothing in the
    # last recorded run, so a semantic-looking class is tried first.
    # NOTE(review): 'cli-title-metadata-item' is assumed to be the class of the
    # year/runtime/certificate spans; confirm against the live page markup.
    candidate_spans = []
    for css_class in ('cli-title-metadata-item', 'sc-b189961a-8', 'sc-43986a27-8'):
        candidate_spans.extend(movie_tag.find_all('span', class_=css_class))

    for span in candidate_spans:
        year_match = re.search(r'\b(1[89]\d{2}|20\d{2})\b', span.get_text())
        if year_match:
            return int(year_match.group(1))
    return None


# A timeout keeps a stalled connection from hanging the cell; raise_for_status
# surfaces a blocked/failed request instead of parsing an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

movies = soup.find_all('li', class_='ipc-metadata-list-summary-item')
if not movies:
    # Everything downstream indexes columns that only exist when at least one
    # movie was parsed, so stop here with an explicit explanation.
    raise RuntimeError("No movie entries found on the page; the markup may have changed.")

movie_data = []
for i, movie in enumerate(movies):
    try:
        title_element = movie.find('h3', class_='ipc-title__text')
        if title_element:
            # Titles are rendered as "<rank>. <title>"; drop the rank prefix.
            title = re.sub(r'^\d+\.\s*', '', title_element.get_text())
        else:
            title = "Unknown"

        rating = None
        rating_element = movie.find('span', class_='ipc-rating-star--rating')
        if rating_element:
            try:
                rating = float(rating_element.get_text().strip())
            except ValueError:
                rating = None

        votes_element = movie.find('span', class_='ipc-rating-star--voteCount')
        votes = parse_vote_count(votes_element.get_text()) if votes_element else None

        year = parse_release_year(movie)

        movie_data.append({
            'rank': i + 1,
            'title': title,
            'rating': rating,
            'year': year,
            'votes': votes
        })

    except Exception as e:
        # Best effort: one malformed entry must not abort the whole scrape.
        print(f"Error with movie {i+1}: {e}")
        continue

print(f"Scraped {len(movie_data)} movies")

# Persist the raw scrape before any cleaning so it can be inspected later.
df = pd.DataFrame(movie_data)
df.to_csv('imdb_movies.csv', index=False)
print("Data saved to imdb_movies.csv")

print(f"\nSample of scraped data:")
print(df.head())
print(f"\nData types:")
print(df.dtypes)

print(f"\n--- Data Cleaning ---")
print(f"Original dataset: {len(df)} movies")
print(f"Missing ratings: {df['rating'].isnull().sum()}")
print(f"Missing years: {df['year'].isnull().sum()}")

df_clean = df.copy()

# Columns holding only None come back as object dtype; make them numeric so
# median/fillna behave predictably.
df_clean['rating'] = pd.to_numeric(df_clean['rating'], errors='coerce')
df_clean['year'] = pd.to_numeric(df_clean['year'], errors='coerce')

if df_clean['rating'].isnull().any():
    df_clean['rating'] = df_clean['rating'].fillna(df_clean['rating'].median())

# Record which years are not real observations so later consumers of the
# cleaned data can filter them out.
year_missing = df_clean['year'].isnull()
df_clean['year_estimated'] = year_missing

if year_missing.all():
    # No year could be scraped. Downstream code needs integer years, so random
    # placeholders are substituted, but they carry NO information: every
    # year-based statistic and plot is meaningless in this case.
    print("WARNING: all years are missing; filling with random placeholder years "
          "(1940-2019). Year-based results below are NOT meaningful.")
    rng = np.random.default_rng(42)  # fixed seed: reruns give the same placeholders
    df_clean['year'] = rng.integers(1940, 2020, size=len(df_clean))
elif year_missing.any():
    df_clean['year'] = df_clean['year'].fillna(df_clean['year'].median())

# .copy() detaches the de-duplicated frame so the column assignments below do
# not write into a slice of the previous frame (SettingWithCopyWarning).
df_clean = df_clean.drop_duplicates(subset=['title']).copy()

df_clean['year'] = df_clean['year'].astype(int)
df_clean['rating'] = df_clean['rating'].astype(float)
# Missing vote counts become 0, so a 0 here means "unknown", not "no votes".
df_clean['votes'] = pd.to_numeric(df_clean['votes'], errors='coerce').fillna(0).astype(int)

print(f"After cleaning: {len(df_clean)} movies")
print(f"Missing values: {df_clean.isnull().sum().sum()}")

df = df_clean

# Summary statistics of the cleaned frame `df`, printed as plain text.
print(f"\n--- Basic Stats ---")
print(f"Average rating: {df['rating'].mean():.2f}")
print(f"Median rating: {df['rating'].median():.2f}")
print(f"Rating range: {df['rating'].min():.1f} - {df['rating'].max():.1f}")
print(f"Standard deviation: {df['rating'].std():.2f}")

# Missing vote counts were replaced by 0 during cleaning, so these figures
# are all 0 when no vote count could be scraped.
print(f"\nVotes stats:")
print(f"Average votes: {df['votes'].mean():,.0f}")
print(f"Median votes: {df['votes'].median():,.0f}")
print(f"Votes range: {df['votes'].min():,} - {df['votes'].max():,}")
print(f"Standard deviation: {df['votes'].std():,.0f}")

# Years may be fill-in values from the cleaning step rather than scraped ones.
print(f"\nOldest movie: {df['year'].min()}")
print(f"Newest movie: {df['year'].max()}")

# Decade bucket: floor the year to a multiple of 10 (e.g. 1994 -> 1990).
# This adds a 'decade' column to `df`; `decade_counts` is reused by later code.
df['decade'] = (df['year'] // 10) * 10
print(f"\nMovies by decade:")
decade_counts = df['decade'].value_counts().sort_index()
for decade, count in decade_counts.items():
    print(f"{int(decade)}s: {count} movies")

# nlargest keeps the five rows with the highest rating.
print(f"\nTop 5 highest rated:")
top_5 = df.nlargest(5, 'rating')
for _, movie in top_5.iterrows():
    print(f"{movie['title']} ({int(movie['year'])}) - {movie['rating']}")

print(f"\n--- Making plots ---")

# Every figure below is drawn, written to a PNG at 300 dpi and closed again so
# figures do not pile up in memory. Colours are chosen for a dark background.

# 1. Rating histogram with the mean marked by a dashed line.
plt.figure(figsize=(8, 6))
plt.hist(df['rating'], bins=15, color='cyan', edgecolor='white')
plt.axvline(df['rating'].mean(), color='magenta', linestyle='--', label=f'Mean: {df["rating"].mean():.2f}')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.title('Rating Distribution')
plt.legend()
plt.grid(True, color='white', alpha=0.2)
plt.tight_layout()
plt.savefig('rating_distribution.png', dpi=300)
plt.close()

# 2. Number of movies per decade.
plt.figure(figsize=(8, 6))
if len(decade_counts) > 0:
    # Decades sit 10 units apart on a numeric axis; the default bar width of
    # 0.8 data units would draw hairline bars, so widen them to most of a decade.
    plt.bar(decade_counts.index, decade_counts.values, width=8, color='lime')
    plt.xlabel('Decade')
    plt.ylabel('Number of Movies')
    plt.title('Movies by Decade')
    plt.xticks(rotation=45)
    plt.grid(True, color='white', alpha=0.2)
else:
    plt.text(0.5, 0.5, 'No decade data', ha='center', va='center', transform=plt.gca().transAxes)
plt.tight_layout()
plt.savefig('movies_by_decade.png', dpi=300)
plt.close()

# 3. Rating against release year.
plt.figure(figsize=(8, 6))
plt.scatter(df['year'], df['rating'], alpha=0.7, color='tomato', s=50)
plt.xlabel('Release Year')
plt.ylabel('Rating')
plt.title('Rating vs Release Year')
plt.grid(True, color='white', alpha=0.2)
plt.tight_layout()
plt.savefig('rating_vs_year.png', dpi=300)
plt.close()

# 4. Rating spread per decade, one filled box per decade present in the data.
plt.figure(figsize=(8, 6))
if len(decade_counts) > 0:
    decades = sorted(df['decade'].unique())
    decade_ratings = [df[df['decade'] == d]['rating'].values for d in decades]
    plt.boxplot(decade_ratings, tick_labels=[f"{int(d)}s" for d in decades], patch_artist=True,
                boxprops=dict(facecolor='purple', color='white'),
                medianprops=dict(color='yellow'))
    plt.xlabel('Decade')
    plt.ylabel('Rating')
    plt.title('Rating by Decade')
    plt.xticks(rotation=45)
    plt.grid(True, color='white', alpha=0.2)
else:
    plt.text(0.5, 0.5, 'No decade data', ha='center', va='center', transform=plt.gca().transAxes)
plt.tight_layout()
plt.savefig('rating_by_decade.png', dpi=300)
plt.close()

# 5. Ten highest-rated movies as horizontal bars, best at the top.
plt.figure(figsize=(8, 6))
top_10 = df.nlargest(10, 'rating')
# Long titles are cut to 25 characters so the axis labels stay readable.
short_titles = [title[:25] + '...' if len(title) > 25 else title for title in top_10['title']]
y_pos = range(len(short_titles))
plt.barh(y_pos, top_10['rating'], color='gold', alpha=0.8)
plt.yticks(y_pos, short_titles)
plt.xlabel('Rating')
plt.title('Top 10 Movies by Rating')
plt.gca().invert_yaxis()
plt.grid(True, color='white', alpha=0.2)
plt.tight_layout()
plt.savefig('top10_movies.png', dpi=300)
plt.close()

# 6. Histogram of release years.
plt.figure(figsize=(8, 6))
plt.hist(df['year'], bins=15, color='deepskyblue', edgecolor='white')
plt.xlabel('Release Year')
plt.ylabel('Count')
plt.title('Movies by Release Year')
plt.grid(True, color='white', alpha=0.2)
plt.tight_layout()
plt.savefig('movies_by_year.png', dpi=300)
plt.close()

print("Plots saved:")
print("   - rating_distribution.png")
print("   - movies_by_decade.png")
print("   - rating_vs_year.png")
print("   - rating_by_decade.png")
print("   - top10_movies.png")
print("   - movies_by_year.png")

# Short plain-language takeaways derived from `df` and `decade_counts`.
print(f"\n--- Some interesting stuff ---")
print(f"1. Average rating is {df['rating'].mean():.2f}, pretty good stuff")

if len(decade_counts) > 0:
    best_decade = decade_counts.idxmax()
    print(f"2. Most movies are from the {int(best_decade)}s ({decade_counts.max()} movies)")
else:
    print(f"2. Year data was messy, had to estimate some years")

print(f"3. Rating standard deviation is {df['rating'].std():.2f}, {'not much spread' if df['rating'].std() < 0.3 else 'decent spread'}")

# Pearson correlation between year and rating; it is NaN when either column
# is constant or there are too few rows, which is not the same as "no effect".
correlation = df['year'].corr(df['rating'])
if pd.isna(correlation):
    print("4. Not enough variation to relate year and rating")
elif correlation > 0.1:
    print("4. Newer movies seem to score a bit higher")
elif correlation < -0.1:
    print("4. Older movies seem to score a bit higher")
else:
    print("4. Year doesn't really affect rating much")

print(f"5. Movies span {df['year'].max() - df['year'].min()} years")

# Missing vote counts are stored as 0, so a maximum of 0 means no votes were
# scraped at all; naming a "most voted" movie then would be a false claim.
if df['votes'].max() > 0:
    most_voted = df.loc[df['votes'].idxmax()]
    print(f"6. '{most_voted['title']}' ({most_voted['year']}) got the most votes.")
else:
    print("6. No vote counts were scraped, so the most-voted movie is unknown.")

print(f"\nDone! Check out the CSV files and PNG plots")

df.to_csv('imdb_movies_clean.csv', index=False)
print("Clean data saved to 'imdb_movies_clean.csv'")

Starting IMDB scraping...
Scraped 25 movies
Data saved to imdb_movies.csv

Sample of scraped data:
   rank                     title  rating  year votes
0     1  The Shawshank Redemption     9.3  None  None
1     2             The Godfather     9.2  None  None
2     3           The Dark Knight     9.1  None  None
3     4    The Godfather: Part II     9.0  None  None
4     5              12 Angry Men     9.0  None  None

Data types:
rank        int64
title      object
rating    float64
year       object
votes      object
dtype: object

--- Data Cleaning ---
Original dataset: 25 movies
Missing ratings: 0
Missing years: 25
All years are missing, using estimated years based on movie position...
After cleaning: 25 movies
Missing values: 0

--- Basic Stats ---
Average rating: 8.82
Median rating: 8.80
Rating range: 8.6 - 9.3
Standard deviation: 0.20

Votes stats:
Average votes: 0
Median votes: 0
Votes range: 0 - 0
Standard deviation: 0

Oldest movie: 1941
Newest movie: 2019

Movies by decade: