### LINEAR REGRESSION FOR THE NEXT 2 WEEKS

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

### 1. Data Preparation and Feature Engineering

In [18]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Assume 'song_id' is the unique identifier for the track/artist combination
# and 'Seconds since Epoch' provides the chronological order.
window_spec = Window.partitionBy('song_id').orderBy('Seconds since Epoch')

df = df.withColumn(
    'next_week_rank',
    F.lead('peak_rank', 1).over(window_spec)
)

df = df.withColumn(
    'two_week_ahead_rank',
    F.lead('peak_rank', 2).over(window_spec)
)

# Drop rows where the target is null (the last two weeks of data for each song)
df_train = df.na.drop(subset=['next_week_rank', 'two_week_ahead_rank'])

RuntimeError: SparkContext or SparkSession should be created first.

In [None]:
from pyspark.ml.feature import VectorAssembler

# List the features you want to use
feature_columns = [
    'trackAppearanceCount', 'artistAppearanceCount', 'peak_rank',
    'previous_rank', 'weeks_on_chart', 'streams', 'rank difference',
    'Position over Time', 'ArtistCount', 'isTopTen', 'IsGirlGroup',
    'IsBoyGroup', 'IsMixedGroup'
]

vector_assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features'
)

In [None]:
# Assuming your data is ordered by time, select a cutoff point
# For simplicity, let's use a random split for now, but in time-series
# use a time-based split (e.g., 90% oldest for training, 10% newest for testing)
train_df, test_df = df_train.randomSplit([0.8, 0.2], seed=42)

In [None]:
from pyspark.ml.regression import LinearRegression

# 1. Initialize the Linear Regression estimator
lr_t1 = LinearRegression(
    labelCol='next_week_rank', # Target variable
    featuresCol='features',
    regParam=0.1,             # Regularization parameter
    elasticNetParam=0.0       # 0.0 for L2 (Ridge) regularization
)

# 2. Create the Pipeline
from pyspark.ml import Pipeline

pipeline_t1 = Pipeline(stages=[vector_assembler, lr_t1])

# 3. Train the model
model_t1 = pipeline_t1.fit(train_df)

print("T+1 Model Training Complete.")

In [None]:
# 1. Initialize the Linear Regression estimator
lr_t2 = LinearRegression(
    labelCol='two_week_ahead_rank', # New target variable
    featuresCol='features',
    regParam=0.1,
    elasticNetParam=0.0
)

# 2. Create the Pipeline
pipeline_t2 = Pipeline(stages=[vector_assembler, lr_t2])

# 3. Train the model
model_t2 = pipeline_t2.fit(train_df)

print("T+2 Model Training Complete.")

In [None]:
# The original rows where the targets were NULL (the latest two weeks of data per song)
df_pred = df.filter(F.col('next_week_rank').isNull() | F.col('two_week_ahead_rank').isNull())

# Further filtering might be necessary to isolate ONLY the latest week (t) and second-to-latest week (t-1)
# to predict t+1 and t+2, respectively.
# For a robust prediction, you usually use the very last week (t) of known data for each song.

# Get only the last known observation (time t) for each song to predict t+1 and t+2
window_last = Window.partitionBy('song_id').orderBy(F.desc('Seconds since Epoch'))
df_latest = df.withColumn('row_num', F.row_number().over(window_last)).filter(F.col('row_num') == 1).drop('row_num')

In [None]:
# Predict rank for the next week (t+1)
predictions_t1 = model_t1.transform(df_latest).select(
    'song_id', 'peak_rank', 'Weeks_on_chart',
    F.round(F.col('prediction')).alias('predicted_rank_t_plus_1')
)

# Predict rank for the week after (t+2)
predictions_t2 = model_t2.transform(df_latest).select(
    'song_id',
    F.round(F.col('prediction')).alias('predicted_rank_t_plus_2')
)

In [None]:
final_predictions = predictions_t1.join(predictions_t2, on='song_id', how='inner')

final_predictions.show()
# Example output:
# +-------+---------+----------------+-----------------------+-----------------------+
# |song_id|peak_rank|Weeks_on_chart|predicted_rank_t_plus_1|predicted_rank_t_plus_2|
# +-------+---------+----------------+-----------------------+-----------------------+
# |song_A |    5    |      10        |           7           |           10          |
# |song_B |   45    |       3        |          40           |          35           |
# +-------+---------+----------------+-----------------------+-----------------------+