In [None]:
#SESSION INITIALIZATION AND SPARK STARTUP
import pyspark as pys 
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
import pyspark.sql as sql

#Spark session: getOrCreate() reuses an active session or builds a new one
spark = SparkSession.builder.getOrCreate()

#Spark context exposed by the session
sc = spark.sparkContext


In [None]:
#LOAD THE DATAFRAME

forest_path = './learningPySpark/Data/forest_coverage_type.csv'

#The first CSV row holds the column names; column types are inferred from the data
csv_options = {"header": True, "inferSchema": True}
forest = spark.read.csv(forest_path, **csv_options)

#Print the inferred schema (original note: every field is numeric)
forest.printSchema()

In [None]:
#Transformers
import pyspark.sql.functions as f
import pyspark.ml.feature as feat
import numpy as np

#Bucketizer
bucket_no=10

#Minimum and maximum of the column, computed in a single aggregation
dist_min_max = (
    forest.agg(
        f.min('Horizontal_Distance_To_Hydrology').alias("min")
        ,f.max('Horizontal_Distance_To_Hydrology').alias("max")
    ).rdd
    .map(lambda row: (row.min,row.max))
    .collect()[0] #collect() returns a list; keep its first (and only) element
)
if dist_min_max[0] is None or dist_min_max[1] is None:
    raise ValueError(
        "Horizontal_Distance_To_Hydrology has no values: cannot derive bucket splits"
    )

#Width of the observed range
rng = dist_min_max[1]-dist_min_max[0]
if rng <= 0:
    #Bucketizer needs strictly increasing split points
    raise ValueError(
        "Horizontal_Distance_To_Hydrology is constant: cannot derive bucket splits"
    )

#bucket_no + 1 evenly spaced split points (both extremes included) delimit
#bucket_no buckets. linspace always emits the upper bound, whereas
#arange(min, max, step) stops before it and would leave the largest values
#outside every bucket, which Bucketizer treats as an error by default.
splits = np.linspace(dist_min_max[0], dist_min_max[1], bucket_no + 1).tolist()

#Build the transformer
bucketizer = feat.Bucketizer(
    splits=splits, #boundaries at which the column is cut
    inputCol="Horizontal_Distance_To_Hydrology",
    outputCol="Horizontal_Distance_To_Hydrology_Bkt"
)

#Apply it and show the continuous value next to its bucket index
(
    bucketizer.transform(forest).select(
        'Horizontal_Distance_To_Hydrology'
        ,'Horizontal_Distance_To_Hydrology_Bkt'
    ).show(5)
)

#Resulting DataFrame has 2 columns: the continuous value and the bucket number

In [None]:
#Estimators (see below) expect a single input column,
#so the columns are first packed into one vector column

#Pack every column of the DataFrame into the 'feat' vector column
vectorAssembler = feat.VectorAssembler(inputCols=forest.columns, outputCol='feat')

#Project the assembled vector onto its first 5 principal components
pca = feat.PCA(
    k=5,
    inputCol=vectorAssembler.getOutputCol(),
    outputCol='pca_feat',
)

#Assemble once, then fit and apply the PCA on the same assembled DataFrame
assembled = vectorAssembler.transform(forest)
pca_model = pca.fit(assembled)
pca_model.transform(assembled).select("feat", "pca_feat").take(1)

In [None]:
#ESTIMATORS
import pyspark.ml.classification as cl

#Goal: predict whether the forest has "cover type" = 1 (spruce-fir conifers)

#Build an SVM model
#Features: every column except the last one (cover type)
vectorAssembler = feat.VectorAssembler(
    inputCols=forest.columns[0:-1],
    outputCol="features",
)

#Binary label: 1 when CoverType equals 1, 0 otherwise
is_spruce_fir = (f.col("CoverType")==1).cast("integer")

fir_dataset = (
    vectorAssembler
    .transform(forest)
    .withColumn("label", is_spruce_fir)
    .select("label", "features")
)

svc_obj = cl.LinearSVC(maxIter=10, regParam=0.01)
svc_model = svc_obj.fit(fir_dataset)

In [None]:
#Inspect the outcome of the fit: coefficients of the trained SVM model
svc_model.coefficients

In [None]:
#Regressione lineare
import pyspark.ml.regression as rg

#Goal: estimate the elevation of a forest

#Features: every column except the first one
vectorAssembler = feat.VectorAssembler(
    inputCols=forest.columns[1:],
    outputCol="features",
)

#Continuous regression target
elevation_label = f.col("Elevation").cast("float")

elevation_dataset = (
    vectorAssembler
    .transform(forest)
    .withColumn("label", elevation_label)
    .select("label", "features")
)

#Linear regression model
lr_obj = rg.LinearRegression(maxIter=10, regParam=0.01, elasticNetParam=1.00)
lr_model = lr_obj.fit(elevation_dataset)

#The code needed to build a model is fairly standard and easy to write

In [None]:
#Results of the fit
#(bare expression in the middle of the cell: evaluated but not displayed)
lr_model.coefficients

#Training summary produced by the model
summary = lr_model.summary

#R2 (original note: 78%, a good result), RMSE and MAE, printed on one line
fit_metrics = (
    summary.r2,
    summary.rootMeanSquaredError,
    summary.meanAbsoluteError,
)
print(*fit_metrics)

In [None]:
#PIPELINES
from pyspark.ml import Pipeline

#Feature vector: every column except the first one
vectorAssembler = feat.VectorAssembler(
    inputCols=forest.columns[1:],
    outputCol="features",
)

#Model: generalized linear regression on Elevation with an identity link
lr_obj = rg.GeneralizedLinearRegression(
    labelCol="Elevation",
    maxIter=10,
    regParam=0.01,
    link="identity",
    linkPredictionCol="p",
)

pip = Pipeline(stages=[vectorAssembler, lr_obj])

#Fit the whole pipeline, then compare actual and predicted elevation
glm_pipeline_model = pip.fit(forest)
(
    glm_pipeline_model
    .transform(forest)
    .select("Elevation", "prediction")
    .show(5)
)

#Original note: the predictive model is fairly accurate


In [None]:
#Istogramma di elevation per capire quale modello usare (usato sopra)
import matplotlib.pyplot as plt
#Bring the Elevation column to pandas and plot its histogram
transformed_df = forest.select('Elevation')
elevation_pdf = transformed_df.toPandas()
elevation_pdf.hist()

#Save the figure to disk, then release every open figure
plt.savefig('Elevation_histogram.png')
plt.close('all')

#Original note: it looks like a normal distribution

In [None]:
#SELECTING THE MOST PREDICTIVE VARIABLES

#Top 10 variables for predicting the class of an observation

#Pack all columns into one vector,
#except the last one (CoverType), which is the target
vectorAssembler = feat.VectorAssembler(
    inputCols=forest.columns[0:-1],
    outputCol="features",
)

#Pick the best features with the chi-square test
selector = feat.ChiSqSelector(
    labelCol="CoverType",   #target column
    numTopFeatures=10,
    outputCol="selected",   #vector holding the 10 selected features
)

pipeline_sel = Pipeline(stages=[vectorAssembler, selector])

#Fit the selection pipeline and show the selected feature vectors
selection_model = pipeline_sel.fit(forest)
(
    selection_model
    .transform(forest)
    .select(selector.getOutputCol())
    .show(5)
)


In [None]:
#Selezione di variabili continue, usando la correlazione
import pyspark.ml.stat as st

#Pack every column (features and label alike) into one vector
features_and_label = feat.VectorAssembler(
    inputCols=forest.columns
    ,outputCol="features"
)

#Pearson correlation matrix between all the assembled columns
corr = st.Correlation.corr(
    features_and_label.transform(forest)
    ,"features"
    ,"pearson"
)

#Collect the matrix once and reuse it: every collect() re-runs the Spark job
corr_matrix = corr.collect()[0][0]
print(str(corr_matrix))

#Extract the 10 columns most correlated with the reference column.
#NOTE(review): the reference is column 0 of the matrix, i.e. the first column
#of forest.columns (the one other cells exclude via columns[1:] when Elevation
#is the label), not the last column (CoverType) - confirm this is intended.

num_of_features = 10

#Position in the assembled vector -> column name
cols = dict(enumerate(forest.columns))

#(position, correlation) pairs against column 0; the [1:] slice drops the
#correlation of column 0 with itself
reference_corr = corr_matrix.toArray()[:,0]
label_corr_idx = [
    (position, value) for position, value in enumerate(reference_corr)
][1:]

#Strongest correlation first, regardless of sign
label_corr_idx_sorted = sorted(
    label_corr_idx
    , key=lambda el: -abs(el[1])
)

features_selected = np.array([
    cols[position]
    for position, _ in label_corr_idx_sorted
])[0:num_of_features]
features_selected


In [None]:
#Predict the forest coverage type
#with a logistic regression model

#Train / test split
forest_train, forest_test = forest.randomSplit([0.7,0.3], seed=666)

#Pack all columns but the last one (CoverType) into one vector
vectorAssembler = feat.VectorAssembler(
    inputCols=forest.columns[0:-1],
    outputCol="features",
)

#Keep the 10 most useful features
selector = feat.ChiSqSelector(
    labelCol="CoverType",
    numTopFeatures=10,
    outputCol="selected",
)

#Multinomial logistic regression on the selected features
logReg_obj = cl.LogisticRegression(
    labelCol="CoverType",
    featuresCol=selector.getOutputCol(),
    regParam=0.01,
    elasticNetParam=1.0,
    family="multinomial",
)

#Chain assembling, selection and model, then train on the training split
training_stages = [vectorAssembler, selector, logReg_obj]
pipeline = Pipeline(stages=training_stages)

pModel = pipeline.fit(forest_train)

In [None]:
#Testiamo il modello
import pyspark.ml.evaluation as ev

#Score the held-out split with the trained pipeline
results_logReg = pModel.transform(forest_test).select(
    "CoverType", "probability", "prediction"
)

evaluator = ev.MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="CoverType",
)

#Three scores: the evaluator's default metric, weighted precision and accuracy
default_score = evaluator.evaluate(results_logReg)
weighted_precision = evaluator.evaluate(
    results_logReg, {evaluator.metricName: 'weightedPrecision'}
)
accuracy = evaluator.evaluate(
    results_logReg, {evaluator.metricName: 'accuracy'}
)

(default_score, weighted_precision, accuracy)

In [None]:
#Estimate forest elevation

#XXX

#Features: every column except the first one (the target)
vectorAssembler = feat.VectorAssembler(
    inputCols=forest.columns[1:],
    outputCol='features',
)

rf_obj = rg.RandomForestRegressor(
    labelCol='Elevation',
    maxDepth=10,
    minInstancesPerNode=10,
    minInfoGain=0.1,
    numTrees=10,
)
pip = Pipeline(stages=[vectorAssembler, rf_obj])

#Performance check (fit and scored on the same DataFrame)
rf_pipeline_model = pip.fit(forest)
results = rf_pipeline_model.transform(forest).select('Elevation', 'prediction')

evaluator = ev.RegressionEvaluator(labelCol='Elevation')
evaluator.evaluate(results, {evaluator.metricName: 'r2'})

#Original note: 83%, better than the linear regression

In [None]:
#MODELLO DI CLUSTERING
import pyspark.ml.clustering as clust

#Features: every column except the last one (CoverType)
vectorAssembler = feat.VectorAssembler(
    inputCols=forest.columns[:-1],
    outputCol='features',
)

#k is the expected number of clusters
kmeans_obj = clust.KMeans(k=7, seed=666)

pip = Pipeline(stages=[vectorAssembler, kmeans_obj])

#Fit, assign every row to a cluster and look at the outcome
kmeans_pipeline_model = pip.fit(forest)
results = kmeans_pipeline_model.transform(forest).select(
    'features', 'CoverType', 'prediction'
)
results.show(5)

In [None]:
#Evaluate the clustering held in `results` with the evaluator's default settings
#NOTE(review): `results` is also assigned by other cells; this cell assumes it
#still holds the KMeans output ('features' and 'prediction' columns)
clustering_ev = ev.ClusteringEvaluator()
clustering_ev.evaluate(results)

#Original note: 0.50, a good result

In [None]:
#Tuning dei parametri
#Meglio non eseguire questa cella, esecuzione molto lunga
import pyspark.ml.tuning as tune

#Pack all columns but the last one (CoverType) into one vector
vectorAssembler = feat.VectorAssembler(
    inputCols=forest.columns[0:-1],
    outputCol='features',
)

#Keep only the 5 best features according to the chi-square test
selector = feat.ChiSqSelector(
    labelCol='CoverType',
    numTopFeatures=5,
    outputCol='selected',
)

logReg_obj = cl.LogisticRegression(
    labelCol='CoverType',
    featuresCol=selector.getOutputCol(),
    family='multinomial',
)

#Tune 2 parameters with 2 levels each -> 4 candidate models
grid_builder = tune.ParamGridBuilder()
grid_builder = grid_builder.addGrid(logReg_obj.regParam, [0.01, 0.1])
grid_builder = grid_builder.addGrid(logReg_obj.elasticNetParam, [1.0, 0.5])
logReg_grid = grid_builder.build()

logReg_ev = ev.MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='CoverType',
)

cross_v = tune.CrossValidator(
    estimator=logReg_obj,
    estimatorParamMaps=logReg_grid,
    evaluator=logReg_ev,
)

#Feature preparation is fitted once, outside the cross-validation
pipeline = Pipeline(stages=[vectorAssembler, selector])
data_trans = pipeline.fit(forest_train)

#Cross-validate the logistic regression on the prepared training data
train_prepared = data_trans.transform(forest_train)
logReg_modelTest = cross_v.fit(train_prepared)

In [None]:
#Test the tuned model on the held-out split
data_trans_test = data_trans.transform(forest_test)
results = logReg_modelTest.transform(data_trans_test)

#One score per line, in this order: weighted precision, weighted recall, accuracy
for metric in ('weightedPrecision', 'weightedRecall', 'accuracy'):
    print(logReg_ev.evaluate(results, {logReg_ev.metricName: metric}))

#Original note: scores got worse because 5 features were selected instead of 10

In [None]:
#Estrarre feauture dal testo
import pyspark.ml.feature as feat
from pyspark.ml import Pipeline

#The text is split into words, the occurrences of each word are counted and
#a hashing step turns them into numeric variables

#Define a small dataset of text passages: one row per passage, single column 'text'

some_text = spark.createDataFrame([
['''
Apache Spark achieves high performance for both batch
and streaming data, using a state-of-the-art DAG scheduler,
a query optimizer, and a physical execution engine.
''']
, ['''
Apache Spark is a fast and general-purpose cluster computing
system. It provides high-level APIs in Java, Scala, Python
and R, and an optimized engine that supports general execution
graphs. It also supports a rich set of higher-level tools including
Spark SQL for SQL and structured data processing, MLlib for machine
learning, GraphX for graph processing, and Spark Streaming.
''']
, ['''
Machine learning is a field of computer science that often uses
statistical techniques to give computers the ability to "learn"
(i.e., progressively improve performance on a specific task)
with data, without being explicitly programmed.
''']
], ['text'])


#Split on whitespace, commas, periods and double quotes.
#Raw string: '\s' in a normal string literal is an invalid escape sequence
#(a warning today, an error in future Python versions); the regex is unchanged.
splitter = feat.RegexTokenizer(
    inputCol='text'
    , outputCol='text_split'
    , pattern=r'\s+|[,."]'
)

#Drop the non-significant words (stop words)
sw_remover = feat.StopWordsRemover(
    inputCol=splitter.getOutputCol()
    , outputCol='no_stopWords'
)

#Hash the remaining words into a term-frequency vector with 20 slots
hasher = feat.HashingTF(
    inputCol=sw_remover.getOutputCol()
    , outputCol='hashed'
    , numFeatures=20
)

#Term frequency - inverse document frequency:
#frequency of the word in the text relative to its frequency across all texts,
#a measure of how important the word is within the text
idf = feat.IDF(
    inputCol=hasher.getOutputCol()
    , outputCol='features'
)

pipeline = Pipeline(stages=[splitter, sw_remover, hasher, idf])
pipelineModel = pipeline.fit(some_text)



In [None]:
#Discretizing continuous variables

#A different DataFrame from the one used so far
signal_path = './learningPySpark/Data/fourier_signal.csv'
signal_df = spark.read.csv(signal_path, header=True, inferSchema=True)

#Discretizer: 10 quantile-based buckets over the 'signal' column
steps = feat.QuantileDiscretizer(
    numBuckets=10,
    inputCol='signal',
    outputCol='discretized',
)

#Transformed DataFrame
discretizer_model = steps.fit(signal_df)
transformed = discretizer_model.transform(signal_df)

In [None]:
#Standardizing continuous variables

#Vector representation of the single 'signal' column
vec = feat.VectorAssembler(inputCols=['signal'], outputCol='signal_vec')

#StandardScaler only accepts a vector column as input
norm = feat.StandardScaler(
    inputCol=vec.getOutputCol(),
    outputCol='signal_norm',
    withMean=True,  #center on mean 0
    withStd=True,   #scale to standard deviation 1
)

norm_pipeline = Pipeline(stages=[vec, norm])

#Fit the pipeline and apply it to the same DataFrame
norm_model = norm_pipeline.fit(signal_df)
signal_norm = norm_model.transform(signal_df)

In [None]:
#Grouping texts into buckets (topics) based on their words

#One row per article: full text, expected topic and the object it describes
articles = spark.createDataFrame([
('''
    The Andromeda Galaxy, named after the mythological
    Princess Andromeda, also known as Messier 31, M31,
    or NGC 224, is a spiral galaxy approximately 780
    kiloparsecs (2.5 million light-years) from Earth,
    and the nearest major galaxy to the Milky Way.
    Its name stems from the area of the sky in which it
    appears, the constellation of Andromeda. The 2006
    observations by the Spitzer Space Telescope revealed
    that the Andromeda Galaxy contains approximately one
    trillion stars, more than twice the number of the
    Milky Way’s estimated 200-400 billion stars. The
    Andromeda Galaxy, spanning approximately 220,000 light
    years, is the largest galaxy in our Local Group,
    which is also home to the Triangulum Galaxy and
    other minor galaxies. The Andromeda Galaxy's mass is
    estimated to be around 1.76 times that of the Milky
    Way Galaxy (~0.8-1.5×1012 solar masses vs the Milky
    Way's 8.5×1011 solar masses).
    ''','Galaxy', 'Andromeda')
    #(...) placeholder for further articles; as bare code it would call the
    #tuple above and raise "TypeError: 'tuple' object is not callable"
, ('''
Washington, officially the State of Washington, is a state in the Pacific
Northwest region of the United States. Named after George Washington,
the first president of the United States, the state was made out of the
western part of the Washington Territory, which was ceded by Britain in
1846 in accordance with the Oregon Treaty in the settlement of the
Oregon boundary dispute. It was admitted to the Union as the 42nd state
in 1889. Olympia is the state capital. Washington is sometimes referred
to as Washington State, to distinguish it from Washington, D.C., the
capital of the United States, which is often shortened to Washington.
''','Geography', 'Washington State')
], ['articles', 'Topic', 'Object'])

#Same text preparation as in the text-feature cell

#Split on whitespace, commas, periods and double quotes.
#Raw string: '\s' in a normal string literal is an invalid escape sequence
#(a warning today, an error in future Python versions); the regex is unchanged.
splitter = feat.RegexTokenizer(
    inputCol='articles'
    , outputCol='articles_split'
    , pattern=r'\s+|[,."]'
)

#Drop stop words
sw_remover = feat.StopWordsRemover(
    inputCol=splitter.getOutputCol()
    , outputCol='no_stopWords'
)

#Turn the remaining words into a vector of word counts
count_vec = feat.CountVectorizer(
    inputCol=sw_remover.getOutputCol()
    , outputCol='vector'
)

#Clustering
#Latent Dirichlet Allocation
lda_clusters = clust.LDA(
    k=3 #3 clusters are expected
    , optimizer='online'
    , featuresCol=count_vec.getOutputCol()
)

topic_pipeline = Pipeline(
    stages=[
        splitter
        , sw_remover
        , count_vec
        , lda_clusters
    ]
)


In [None]:
#Test: fit the topic pipeline and score the same articles
scored_articles = (
    topic_pipeline
    .fit(articles)
    .transform(articles)
    .select('Topic', 'Object', 'topicDistribution')
    .take(10)
)

#Per article: expected topic, object, index of the strongest topic, full distribution
for topic in scored_articles:
    strongest_topic = np.argmax(topic.topicDistribution)
    print(topic.Topic, topic.Object, strongest_topic, topic.topicDistribution)

In [None]:
#Shut down the Spark session created at the top of the notebook
spark.stop()
#Also stop the SparkContext handle that was taken from that session
sc.stop()

In [1]:
%%help

UsageError: Cell magic `%%help` not found.
