# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file exists.
df = pd.read_csv('c:/Users/yuvra/Downloads/proj/priyanshi/Netflix Life Impact Dataset (NLID).csv')

# 1. Data Understanding and Preprocessing
# Convert the recommendation column to a float column.
# str.rstrip('% Y') treats its argument as a character set: it removes any
# run of trailing '%', ' ' or 'Y' characters, not the literal suffix "% Y".
# pd.to_numeric(errors='coerce') turns anything that is still not a number
# into NaN, so a single malformed cell no longer aborts the whole script.
raw_recommendation = df['Suggested to Friends/Family (Y/N %)']
df['Recommendation_Rate'] = pd.to_numeric(
    raw_recommendation.str.rstrip('% Y'),
    errors='coerce',
).astype(float)

# Report coerced cells explicitly so bad data is not hidden silently.
unparsed_mask = df['Recommendation_Rate'].isna() & raw_recommendation.notna()
if unparsed_mask.any():
    print(
        f"Warning: {int(unparsed_mask.sum())} recommendation value(s) "
        "could not be parsed and were set to NaN"
    )

# Expose 'Release Year' under an underscore-style name. The values are copied
# unchanged; no parsing or year extraction happens here.
df['Release_Year'] = df['Release Year']

# 2. Exploratory Data Analysis
# Histogram of the 'Average Rating' column, split into 20 bins.
rating_fig, rating_ax = plt.subplots(figsize=(15, 6))
rating_ax.hist(df['Average Rating'], bins=20)
rating_ax.set_title('Distribution of Movie Ratings')
rating_ax.set_xlabel('Rating')
rating_ax.set_ylabel('Number of Movies')
rating_fig.tight_layout()
plt.show()

# Genre Analysis
# Bar chart of how many rows fall into each 'Genre' value.
genre_counts = df['Genre'].value_counts()
genre_fig, genre_ax = plt.subplots(figsize=(12, 6))
genre_counts.plot(kind='bar', ax=genre_ax)
genre_ax.set_title('Movie Distribution by Genre')
genre_ax.set_xlabel('Genre')
genre_ax.set_ylabel('Number of Movies')
# Tilt the category labels so longer genre names stay readable.
plt.setp(genre_ax.get_xticklabels(), rotation=45)
genre_fig.tight_layout()
plt.show()

# Rating vs Recommendation Rate
# Scatter plot with average rating on x and the parsed recommendation
# percentage on y.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
scatter_ax.scatter(df['Average Rating'], df['Recommendation_Rate'])
scatter_ax.set_title('Rating vs Recommendation Rate')
scatter_ax.set_xlabel('Average Rating')
scatter_ax.set_ylabel('Recommendation Rate (%)')
scatter_fig.tight_layout()
plt.show()

# Correlation Analysis
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap
# with the colour scale centred on zero.
numeric_columns = [
    'Release_Year',
    'Average Rating',
    'Number of Reviews',
    'Recommendation_Rate',
]
correlation = df[numeric_columns].corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, ax=heatmap_ax)
heatmap_ax.set_title('Correlation Heatmap')
heatmap_fig.tight_layout()
plt.show()

# 3. Discovery Method Analysis
# Pie chart of the share of each 'How Discovered' value, labelled with
# one-decimal percentages.
discovery_counts = df['How Discovered'].value_counts()
pie_fig, pie_ax = plt.subplots(figsize=(10, 6))
discovery_counts.plot(kind='pie', autopct='%1.1f%%', ax=pie_ax)
pie_ax.set_title('How Movies Were Discovered')
# Equal axis scaling keeps the pie circular.
pie_ax.axis('equal')
pie_fig.tight_layout()
plt.show()

# 4. Top Movies Analysis
# Print the ten rows with the largest average rating, then the ten rows with
# the largest recommendation rate, each with a few descriptive columns.
print("\nTop 10 Highest Rated Movies:")
top_rated_columns = ['Movie Title', 'Genre', 'Average Rating', 'Meaningful Advice Taken']
top_rated = df.nlargest(10, 'Average Rating')
top_rated = top_rated[top_rated_columns]
print(top_rated)

print("\nMost Recommended Movies:")
most_recommended_columns = ['Movie Title', 'Genre', 'Recommendation_Rate', 'Meaningful Advice Taken']
most_recommended = df.nlargest(10, 'Recommendation_Rate')
most_recommended = most_recommended[most_recommended_columns]
print(most_recommended)

# 5. Predictive Modeling
# Fit a random forest that predicts Recommendation_Rate from release year,
# average rating and number of reviews, then report MSE and R^2 on a
# held-out 20% split.
feature_columns = ['Release_Year', 'Average Rating', 'Number of Reviews']
target_column = 'Recommendation_Rate'

# A missing target cell ends up as NaN in Recommendation_Rate, and
# scikit-learn regressors reject NaN in y. Drop those rows before splitting
# so they cannot abort training. This is a no-op when every row has a target.
model_df = df.dropna(subset=[target_column])

# Prepare features for prediction
X = model_df[feature_columns]
y = model_df[target_column]

# Split the data (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale the features. The scaler is fitted on the training split only and
# then applied to the test split, so test statistics do not leak into it.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nModel Performance:")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared Score: {r2:.2f}")

# Feature importance
# Pair each feature column name with the fitted model's importance score,
# order from most to least important, and draw a horizontal bar chart.
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': model.feature_importances_}
).sort_values('importance', ascending=False)

importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance, ax=importance_ax)
importance_ax.set_title('Feature Importance for Predicting Recommendation Rate')
importance_fig.tight_layout()
plt.show()

# 6. Additional Insights
# Mean rating per genre, highest first.
print("\nAverage Rating by Genre:")
mean_rating_by_genre = df.groupby('Genre')['Average Rating'].mean()
print(mean_rating_by_genre.sort_values(ascending=False))

# The most frequent 'Meaningful Advice Taken' values (head() keeps the top five).
print("\nMost Common Life Lessons:")
advice_frequency = df['Meaningful Advice Taken'].value_counts()
print(advice_frequency.head())