# Modelling 

### Standardization

In [None]:
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.functions import vector_to_array

# Standardize one numeric column of a PySpark DataFrame.
# NOTE: `df` and the column name 'feature_column' are placeholders; `df` is
# expected to be defined before this cell runs.

# Column names used by each stage of this cell.
input_col = 'feature_column'
vector_col = 'feature_vector'
scaled_vector_col = 'scaled_feature'
scaled_col = 'scaled_feature_column'

# StandardScaler operates on vector columns, so wrap the raw column in a
# one-element vector first.
assembler = VectorAssembler(inputCols=[input_col], outputCol=vector_col)
df_vector = assembler.transform(df)

# Fit the scaler (centering and scaling both enabled) and apply it to the
# same data it was fitted on.
scaler = StandardScaler(
    inputCol=vector_col,
    outputCol=scaled_vector_col,
    withMean=True,
    withStd=True,
)
scaler_model = scaler.fit(df_vector)
df_scaled = scaler_model.transform(df_vector)

# Unpack the one-element scaled vector back into a plain scalar column.
scaled_value = vector_to_array(scaled_vector_col).getItem(0)
df_scaled = df_scaled.withColumn(scaled_col, scaled_value)

# Show the original and standardized values side by side.
df_scaled.select(input_col, scaled_col).show()


### Categorical Encoding

In [None]:
# One-hot encode the categorical columns of `combined`.
# NOTE: `combined` is expected to be defined before this cell runs.
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Columns to encode; each one gets a "<name>_index" and a "<name>_vec" column.
categorical_cols = ["VendorID", "RatecodeID", "payment_type"]

# Stage 1: map each categorical column to a numeric index column.
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_index")
    for name in categorical_cols
]
vendor_indexer, ratecode_indexer, payment_type_indexer = indexers

# Stage 2: one-hot encode each index column into a vector column.
encoders = [
    OneHotEncoder(inputCol=f"{name}_index", outputCol=f"{name}_vec")
    for name in categorical_cols
]
vendor_encoder, ratecode_encoder, payment_type_encoder = encoders

# Chain all indexers first, then all encoders, so every encoder finds its
# index column already present.
pipeline = Pipeline(stages=indexers + encoders)

# Fit the pipeline on `combined` and apply it to the same DataFrame.
model = pipeline.fit(combined)
encoded_df = model.transform(combined)

# The raw categorical columns are no longer needed once encoded.
encoded_df = encoded_df.drop(*categorical_cols)


## Feature selection

## Data split

## Linear regression

## Random Forest 