In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Step 1: Load the dataset
df = pd.read_csv('/content/Netflix Userbase.csv')

# Step 2: Inspect the data
print(df.head())
print(df.info())

# Step 3: Handle missing values (if any)
#df.fillna(df.mean(), inplace=True)  # Fill numerical columns with mean values
# If there are categorical missing values, you can fill them with mode
df['Subscription Type'].fillna(df['Subscription Type'].mode()[0], inplace=True)
df['Country'].fillna(df['Country'].mode()[0], inplace=True)
df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
df['Device'].fillna(df['Device'].mode()[0], inplace=True)

# Step 4: Handle date columns ('Join Date' and 'Last Payment Date')
df['Join Date'] = pd.to_datetime(df['Join Date'], format='%d-%m-%y')
df['Last Payment Date'] = pd.to_datetime(df['Last Payment Date'], format='%d-%m-%y')

# Step 5: Extract date features (year, month, day)
df['Join Year'] = df['Join Date'].dt.year
df['Join Month'] = df['Join Date'].dt.month
df['Join Day'] = df['Join Date'].dt.day

df['Last Payment Year'] = df['Last Payment Date'].dt.year
df['Last Payment Month'] = df['Last Payment Date'].dt.month
df['Last Payment Day'] = df['Last Payment Date'].dt.day

# Drop the original date columns
df.drop(columns=['Join Date', 'Last Payment Date'], inplace=True)

# Step 6: One-Hot Encoding of Categorical Variables
df = pd.get_dummies(df, drop_first=True)  # This will convert categorical columns into numerical

# Step 7: Split the data into features and target
X = df.drop(columns=['Monthly Revenue'])  # Assuming you want to predict 'Monthly Revenue'
y = df['Monthly Revenue']

# Step 8: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 9: Scaling the data (only for numerical columns)
scaler = StandardScaler()

# Apply scaling only to the numerical columns
numeric_cols = X_train.select_dtypes(include=[np.number]).columns
X_train_scaled = scaler.fit_transform(X_train[numeric_cols])
X_test_scaled = scaler.transform(X_test[numeric_cols])

# Step 10: Train a model (for example, Linear Regression)
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Step 11: Predict and evaluate the model
y_pred = model.predict(X_test_scaled)

# Evaluate the model using mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# You can also evaluate with other metrics (e.g., R-squared)
r2 = model.score(X_test_scaled, y_test)
print(f'R-squared: {r2}')

   User ID Subscription Type  Monthly Revenue Join Date Last Payment Date  \
0        1             Basic               10  15-01-22          10-06-23   
1        2           Premium               15  05-09-21          22-06-23   
2        3          Standard               12  28-02-23          27-06-23   
3        4          Standard               12  10-07-22          26-06-23   
4        5             Basic               10  01-05-23          28-06-23   

          Country  Age  Gender      Device Plan Duration  
0   United States   28    Male  Smartphone       1 Month  
1          Canada   35  Female      Tablet       1 Month  
2  United Kingdom   42    Male    Smart TV       1 Month  
3       Australia   51  Female      Laptop       1 Month  
4         Germany   33    Male  Smartphone       1 Month  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             -----

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Subscription Type'].fillna(df['Subscription Type'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Country'].fillna(df['Country'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the inte