# Proyecto Final

## Importación
Llamamos a pySpark e importamos todo lo necesario para el proyecto.

In [1]:
import findspark
findspark.init()

from pyspark import SparkContext
sc=SparkContext(master="local[3]")
print(sc)
from pyspark.sql import SparkSession

<SparkContext master=local[3] appName=pyspark-shell>


In [2]:
from pyspark.sql.session import SparkSession
spark = SparkSession(sc)
import pyspark.ml.feature as ft
import pyspark.sql.types as typ
import pandas as pd
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
from pyspark.ml.feature import StringIndexer, VectorIndexer, IndexToString, OneHotEncoder, VectorAssembler, ChiSqSelector, PCA
from pyspark.ml import Pipeline
import pyspark.ml.evaluation as ev
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pyspark.ml.classification as cl
from pyspark.sql.functions import isnan, when, count, col
import pyspark.ml.tuning as tune

## Creación de esquema
Debido a que nuestro esquema inicial era bastante complejo, hemos creado este para asi darles los valores genéricos de String e Int a nuestras variables.

In [3]:
labels = [
    ('index', typ.IntegerType()),
    ('action_type', typ.StringType()),
    ('combined_shot_type', typ.StringType()),
    ('loc_x', typ.IntegerType()),
    ('loc_y', typ.IntegerType()),
    ('minutes_remaining', typ.IntegerType()),
    ('period', typ.IntegerType()),
    ('playoffs', typ.IntegerType()),
    ('season', typ.StringType()),
    ('seconds_remaining', typ.IntegerType()),
    ('shot_distance', typ.IntegerType()),
    ('shot_made_flag', typ.StringType()),
    ('shot_type', typ.StringType()),
    ('shot_zone_area', typ.StringType()),
    ('shot_zone_basic', typ.StringType()),
    ('shot_zone_range', typ.StringType()),
    ('game_date', typ.StringType()),
    ('matchup', typ.StringType()),
    ('opponent', typ.StringType()),
    ('shot_id', typ.IntegerType()),
    ('angulo', typ.StringType())
]
     
schema = typ.StructType([
    typ.StructField(e[0], e[1], False) for e in labels
])

## Importamos el DataSet

In [4]:
datos = pd.read_csv('/Users/joxea/OneDrive/Documentos/UEM/Segundo Curso/Proyecto de Open Data I/Proyecto/Datos/DataFrameLimpioOficial.csv')
datosDF = sqlContext.createDataFrame(datos,schema)

## Creación get_dummy
Esta función lo que nos permite es, principalmente indexar los strings, para que el OneHotEncoder pueda funcionar.

Así podemos convertir nuestro dataset original en un DataFrame compuesto por el vector features, que contiene todos los Strings indexados además de las variables numéricas, y la columna label, que contiene el valor que queremos predecir.

In [5]:
def get_dummy(df,categoricalCols,continuousCols,labelCol):
    
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
    from pyspark.sql.functions import col
    
    indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)) for c in categoricalCols ]
    
    # default setting: dropLast=True
    encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(), outputCol="{0}_encoded".format(indexer.getOutputCol())) for indexer in indexers ]
    
    assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders] + continuousCols, outputCol="features")
    
    pipeline = Pipeline(stages=indexers + encoders + [assembler])
    
    model=pipeline.fit(df)
    data = model.transform(df)
    
    data = data.withColumn('label',col(labelCol))
    
    return data.select('features','label')

In [6]:
catcols = ['action_type','combined_shot_type','season','shot_type','shot_zone_area','shot_zone_basic','shot_zone_range','game_date','matchup','opponent','angulo']

num_cols = ['index','loc_x','loc_y','minutes_remaining','period','playoffs','seconds_remaining','shot_distance','shot_id']
labelCol = 'shot_made_flag'

data = get_dummy(datosDF,catcols,num_cols,labelCol)
data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(1774,[0,56,68,80...|  NaN|
|(1774,[0,56,68,80...|  0.0|
|(1774,[0,56,68,80...|  1.0|
|(1774,[0,56,68,80...|  0.0|
|(1774,[11,58,68,8...|  1.0|
+--------------------+-----+
only showing top 5 rows



### Pasamos nuestro dataframe por el indexador de variables categóricas.
    Primero nuestro label: labelIndexer.
    Segundo el vector features: featureIndexer.

In [7]:
# Index labels, adding metadata to the label column
labelIndexer = StringIndexer(inputCol='label', outputCol='indexedLabel').fit(data)
labelIndexer.transform(data).show(5, True)

+--------------------+-----+------------+
|            features|label|indexedLabel|
+--------------------+-----+------------+
|(1774,[0,56,68,80...|  NaN|         2.0|
|(1774,[0,56,68,80...|  0.0|         0.0|
|(1774,[0,56,68,80...|  1.0|         1.0|
|(1774,[0,56,68,80...|  0.0|         0.0|
|(1774,[11,58,68,8...|  1.0|         1.0|
+--------------------+-----+------------+
only showing top 5 rows



In [8]:
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =VectorIndexer(inputCol="features", \
                              outputCol="indexedFeatures", \
                              maxCategories=4).fit(data)
featureIndexer.transform(data).show(5, True)

+--------------------+-----+--------------------+
|            features|label|     indexedFeatures|
+--------------------+-----+--------------------+
|(1774,[0,56,68,80...|  NaN|(1774,[0,56,68,80...|
|(1774,[0,56,68,80...|  0.0|(1774,[0,56,68,80...|
|(1774,[0,56,68,80...|  1.0|(1774,[0,56,68,80...|
|(1774,[0,56,68,80...|  0.0|(1774,[0,56,68,80...|
|(1774,[11,58,68,8...|  1.0|(1774,[11,58,68,8...|
+--------------------+-----+--------------------+
only showing top 5 rows



# Separación de datos

Ahora lo que hacemos es separar nuestro dataframe en e:
1. Creamos el dataframe de los nulos al que llamamos dataNulos y lo quitamos del original mediente el .subtract.
2. Creamos el dataframe de entrenamiento con el 80% de los datos.
3. Creamos el dataframe de test mediante el .subtract, obteniendo así el 20% restante.

In [9]:
dataNulos = data.where(isnan(col("label")))

In [10]:
dataSN = data.subtract(dataNulos)

In [11]:
train=data.sample(0.8,200)

In [12]:
test = data.subtract(train)

# Creamos nuestro clasificador y lo evaluamos

Utilizamos los parámetros por degecto del cl.LogisticRegression

In [13]:
logistic = cl.LogisticRegression(labelCol='indexedLabel')

Creamos el labelConverter, cuya función es desindexar lo ya indexado.

In [14]:
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
labels=labelIndexer.labels)

In [15]:
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, logistic,labelConverter])

In [16]:
model = pipeline.fit(train)

In [17]:
# Make predictions.
predictions = model.transform(test)
# Select example rows to display.
predictions.select("features","label","predictedLabel").show(5)

+--------------------+-----+--------------+
|            features|label|predictedLabel|
+--------------------+-----+--------------+
|(1774,[0,56,61,80...|  0.0|           0.0|
|(1774,[0,56,61,80...|  0.0|           0.0|
|(1774,[0,56,62,82...|  0.0|           0.0|
|(1774,[0,56,62,84...|  0.0|           0.0|
|(1774,[0,56,66,80...|  0.0|           0.0|
+--------------------+-----+--------------+
only showing top 5 rows



In [18]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

# Hacemos la predicción final

In [19]:
# Make predictions.
predictions = model.transform(dataNulos)
# Select example rows to display.
prediccion = predictions.select("predictedLabel")

# Lo pasamos todo al formato que nos pide kaggle

In [20]:
prediccion.toPandas().to_csv('/Users/joxea/OneDrive/Documentos/UEM/Segundo Curso/Proyecto de Open Data II/prediccion.csv')

In [21]:
datosNulos = pd.read_csv('/Users/joxea/OneDrive/Documentos/UEM/Segundo Curso/Proyecto de Open Data I/Proyecto/Datos/DataFrameNulos.csv')

In [22]:
datosPrediccion = pd.read_csv('/Users/joxea/OneDrive/Documentos/UEM/Segundo Curso/Proyecto de Open Data II/prediccion.csv')

In [23]:
datosNulos = datosNulos.drop('index', 1)
datosNulos = datosNulos.drop('action_type', 1)
datosNulos = datosNulos.drop('combined_shot_type', 1)
datosNulos = datosNulos.drop('loc_x', 1)
datosNulos = datosNulos.drop('loc_y', 1)
datosNulos = datosNulos.drop('minutes_remaining', 1)
datosNulos = datosNulos.drop('period', 1)
datosNulos = datosNulos.drop('playoffs', 1)
datosNulos = datosNulos.drop('season', 1)
datosNulos = datosNulos.drop('seconds_remaining', 1)
datosNulos = datosNulos.drop('shot_distance', 1)
datosNulos = datosNulos.drop('shot_made_flag', 1)
datosNulos = datosNulos.drop('shot_type', 1)
datosNulos = datosNulos.drop('shot_zone_area', 1)
datosNulos = datosNulos.drop('shot_zone_basic', 1)
datosNulos = datosNulos.drop('shot_zone_range', 1)
datosNulos = datosNulos.drop('game_date', 1)
datosNulos = datosNulos.drop('matchup', 1)
datosNulos = datosNulos.drop('opponent', 1)
datosNulos = datosNulos.drop('angulo', 1)

In [24]:
submission = pd.DataFrame(
    {
        "shot_id":datosNulos.shot_id,
        "shot_made_flag":datosPrediccion.predictedLabel
    }
)

In [25]:
df=pd.DataFrame(submission)
df.to_csv('submission.csv',header=True,index=False)

# Hyper-Tunning

In [26]:
dataHT = dataSN.sample(0.1,200)
trainHT = dataHT.sample(0.8,200)
testHT = dataHT.subtract(trainHT)

In [27]:
model2 = pipeline.fit(trainHT)

In [28]:
grid = tune.ParamGridBuilder().addGrid(logistic.maxIter, [2, 10, 50, 100]).addGrid(logistic.regParam, [0.0, 0.01, 0.05, 0.3 ]).build()

In [29]:
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")

In [30]:
cv = tune.CrossValidator(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator)

In [31]:
data_train = model2.transform(trainHT)
data_train = data_train.drop('prediction')
data_train = data_train.drop('rawPrediction')
data_train = data_train.drop('probability')

In [None]:
cvModel = cv.fit(data_train)

In [None]:
data_test = model2.transform(testHT)
data_test = data_test.drop('prediction')
data_test = data_test.drop('rawPrediction')
data_test = data_test.drop('probability')

In [None]:
results = cvModel.transform(data_test)

In [None]:
accuracy = evaluator.evaluate(results)
accuracy

In [None]:
print('Best Param (MaxIter):',cvModel.bestModel._java_obj.getMaxIter())

In [None]:
print('Best Param (regParam):',cvModel.bestModel._java_obj.getRegParam())

In [None]:
logistic = cl.LogisticRegression(
    maxIter = cvModel.bestModel._java_obj.getMaxIter(), 
    regParam = cvModel.bestModel._java_obj.getRegParam(), 
    labelCol='indexedLabel')

In [None]:
pipelineHT = Pipeline(stages=[labelIndexer, featureIndexer, logistic,labelConverter])
modelHT = pipelineHT.fit(train)
predictionsHT = modelHT.transform(test)
predictionsHT.select("features","label","predictedLabel").show(5)

In [None]:
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictionsHT)
accuracy

In [None]:
predictionsHT = modelHT.transform(dataNulos)
prediccionHT = predictionsHT.select("predictedLabel")

In [None]:
prediccionHT.toPandas().to_csv('/Users/joxea/OneDrive/Documentos/UEM/Segundo Curso/Proyecto de Open Data II/prediccion.csv')

In [None]:
datosNulos = pd.read_csv('/Users/joxea/OneDrive/Documentos/UEM/Segundo Curso/Proyecto de Open Data I/Proyecto/Datos/DataFrameNulos.csv')

In [None]:
datosPrediccion = pd.read_csv('/Users/joxea/OneDrive/Documentos/UEM/Segundo Curso/Proyecto de Open Data II/prediccion.csv')

In [None]:
datosNulos = datosNulos.drop('index', 1)
datosNulos = datosNulos.drop('action_type', 1)
datosNulos = datosNulos.drop('combined_shot_type', 1)
datosNulos = datosNulos.drop('loc_x', 1)
datosNulos = datosNulos.drop('loc_y', 1)
datosNulos = datosNulos.drop('minutes_remaining', 1)
datosNulos = datosNulos.drop('period', 1)
datosNulos = datosNulos.drop('playoffs', 1)
datosNulos = datosNulos.drop('season', 1)
datosNulos = datosNulos.drop('seconds_remaining', 1)
datosNulos = datosNulos.drop('shot_distance', 1)
datosNulos = datosNulos.drop('shot_made_flag', 1)
datosNulos = datosNulos.drop('shot_type', 1)
datosNulos = datosNulos.drop('shot_zone_area', 1)
datosNulos = datosNulos.drop('shot_zone_basic', 1)
datosNulos = datosNulos.drop('shot_zone_range', 1)
datosNulos = datosNulos.drop('game_date', 1)
datosNulos = datosNulos.drop('matchup', 1)
datosNulos = datosNulos.drop('opponent', 1)
datosNulos = datosNulos.drop('angulo', 1)

In [None]:
submission = pd.DataFrame(
    {
        "shot_id":datosNulos.shot_id,
        "shot_made_flag":datosPrediccion.predictedLabel
    }
)

In [None]:
df=pd.DataFrame(submission)
df.to_csv('submissionHT.csv',header=True,index=False)

# Normalizar Variables Continuas 

In [None]:
vectorizer = ft.VectorAssembler(inputCols=['loc_x'],outputCol= 'output')
normalizer = ft.StandardScaler(inputCol=vectorizer.getOutputCol(),outputCol='loc_x_normalized',withMean=True,withStd=True)
pipeline = Pipeline(stages=[vectorizer, normalizer]) 
datosDF = pipeline.fit(datosDF).transform(datosDF)
datosDF = datosDF.drop('loc_x')
datosDF = datosDF.drop('output')

In [None]:
vectorizer = ft.VectorAssembler(inputCols=['loc_y'],outputCol= 'output')
normalizer = ft.StandardScaler(inputCol=vectorizer.getOutputCol(),outputCol='loc_y_normalized',withMean=True,withStd=True)
pipeline = Pipeline(stages=[vectorizer, normalizer]) 
datosDF = pipeline.fit(datosDF).transform(datosDF)
datosDF = datosDF.drop('loc_y')
datosDF = datosDF.drop('output')

In [None]:
vectorizer = ft.VectorAssembler(inputCols=['shot_distance'],outputCol= 'output')
normalizer = ft.StandardScaler(inputCol=vectorizer.getOutputCol(),outputCol='shot_distance_normalized',withMean=True,withStd=True)
pipeline = Pipeline(stages=[vectorizer, normalizer]) 
datosDF = pipeline.fit(datosDF).transform(datosDF)
datosDF = datosDF.drop('shot_distance')
datosDF = datosDF.drop('output')

In [None]:
vectorizer = ft.VectorAssembler(inputCols=['minutes_remaining'],outputCol= 'output')
normalizer = ft.StandardScaler(inputCol=vectorizer.getOutputCol(),outputCol='minutes_remaining_normalized',withMean=True,withStd=True)
pipeline = Pipeline(stages=[vectorizer, normalizer]) 
datosDF = pipeline.fit(datosDF).transform(datosDF)
datosDF = datosDF.drop('minutes_remaining')
datosDF = datosDF.drop('output')

In [None]:
vectorizer = ft.VectorAssembler(inputCols=['period'],outputCol= 'output')
normalizer = ft.StandardScaler(inputCol=vectorizer.getOutputCol(),outputCol='period_normalized',withMean=True,withStd=True)
pipeline = Pipeline(stages=[vectorizer, normalizer]) 
datosDF = pipeline.fit(datosDF).transform(datosDF)
datosDF = datosDF.drop('period')
datosDF = datosDF.drop('output')

In [None]:
vectorizer = ft.VectorAssembler(inputCols=['playoffs'],outputCol= 'output')
normalizer = ft.StandardScaler(inputCol=vectorizer.getOutputCol(),outputCol='playoffs_normalized',withMean=True,withStd=True)
pipeline = Pipeline(stages=[vectorizer, normalizer]) 
datosDF = pipeline.fit(datosDF).transform(datosDF)
datosDF = datosDF.drop('playoffs')
datosDF = datosDF.drop('output')

In [None]:
vectorizer = ft.VectorAssembler(inputCols=['seconds_remaining'],outputCol= 'output')
normalizer = ft.StandardScaler(inputCol=vectorizer.getOutputCol(),outputCol='seconds_remaining_normalized',withMean=True,withStd=True)
pipeline = Pipeline(stages=[vectorizer, normalizer]) 
datosDF = pipeline.fit(datosDF).transform(datosDF)
datosDF = datosDF.drop('seconds_remaining')
datosDF = datosDF.drop('output')

In [None]:
catcols = ['action_type','combined_shot_type','season','shot_type','shot_zone_area','shot_zone_basic','shot_zone_range','game_date','matchup','opponent','angulo']

num_cols = ['index','loc_x_normalized','loc_y_normalized','minutes_remaining_normalized','period_normalized','playoffs_normalized','seconds_remaining_normalized','shot_distance_normalized','shot_id']
labelCol = 'shot_made_flag'

data = get_dummy(datosDF,catcols,num_cols,labelCol)
data.show(5)

In [None]:
# Index labels, adding metadata to the label column
labelIndexer = StringIndexer(inputCol='label', outputCol='indexedLabel').fit(data)
labelIndexer.transform(data).show(5, True)

In [None]:
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =VectorIndexer(inputCol="features", \
                              outputCol="indexedFeatures", \
                              maxCategories=4).fit(data)
featureIndexer.transform(data).show(5, True)

In [None]:
dataNulosNVC = data.where(isnan(col("label")))

In [None]:
dataSN = data.subtract(dataNulosNVC)

In [None]:
trainNVC=dataSN.sample(0.8,200)
trainNVC.count()

In [None]:
testNVC = dataSN.subtract(trainNVC)

In [None]:
logistic = cl.LogisticRegression(
    maxIter = cvModel.bestModel._java_obj.getMaxIter(), 
    regParam = cvModel.bestModel._java_obj.getRegParam(), 
    labelCol='indexedLabel')

In [None]:
# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
labels=labelIndexer.labels)

In [None]:
# Chain indexers and tree in a Pipeline
pipelineNVC = Pipeline(stages=[labelIndexer, featureIndexer, logistic,labelConverter])

In [None]:
# Train model. This also runs the indexers.
modelNVC = pipelineNVC.fit(trainNVC)

In [None]:
# Make predictions.
predictionsNVC = modelNVC.transform(testNVC)
# Select example rows to display.
prediccionNVC = predictionsNVC.select("features","label","predictedLabel").show(5)

In [None]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictionsNVC)
accuracy

In [None]:
predictionsNVC = modelNVC.transform(dataNulosNVC)
prediccionNVC = predictionsNVC.select("predictedLabel")

In [None]:
prediccionNVC.toPandas().to_csv('/Users/joxea/OneDrive/Documentos/UEM/Segundo Curso/Proyecto de Open Data II/prediccion.csv')

In [None]:
datosNulos = pd.read_csv('/Users/joxea/OneDrive/Documentos/UEM/Segundo Curso/Proyecto de Open Data I/Proyecto/Datos/DataFrameNulos.csv')

In [None]:
datosPrediccion = pd.read_csv('/Users/joxea/OneDrive/Documentos/UEM/Segundo Curso/Proyecto de Open Data II/prediccion.csv')

In [None]:
datosNulos = datosNulos.drop('index', 1)
datosNulos = datosNulos.drop('action_type', 1)
datosNulos = datosNulos.drop('combined_shot_type', 1)
datosNulos = datosNulos.drop('loc_x', 1)
datosNulos = datosNulos.drop('loc_y', 1)
datosNulos = datosNulos.drop('minutes_remaining', 1)
datosNulos = datosNulos.drop('period', 1)
datosNulos = datosNulos.drop('playoffs', 1)
datosNulos = datosNulos.drop('season', 1)
datosNulos = datosNulos.drop('seconds_remaining', 1)
datosNulos = datosNulos.drop('shot_distance', 1)
datosNulos = datosNulos.drop('shot_made_flag', 1)
datosNulos = datosNulos.drop('shot_type', 1)
datosNulos = datosNulos.drop('shot_zone_area', 1)
datosNulos = datosNulos.drop('shot_zone_basic', 1)
datosNulos = datosNulos.drop('shot_zone_range', 1)
datosNulos = datosNulos.drop('game_date', 1)
datosNulos = datosNulos.drop('matchup', 1)
datosNulos = datosNulos.drop('opponent', 1)
datosNulos = datosNulos.drop('angulo', 1)

In [None]:
submission = pd.DataFrame(
    {
        "shot_id":datosNulos.shot_id,
        "shot_made_flag":datosPrediccion.predictedLabel
    }
)

In [None]:
df=pd.DataFrame(submission)
df.to_csv('submissionNVC.csv',header=True,index=False)

# PCA

In [None]:
pca = PCA(k=10, inputCol="features", outputCol="pca_features")
model = pca.fit(data)
dataPCA = model.transform(data)
dataPCA.show(5)

In [None]:
# Index labels, adding metadata to the label column
labelIndexer = StringIndexer(inputCol='label', outputCol='indexedLabel').fit(dataPCA)
labelIndexer.transform(dataPCA).show(5, True)

In [None]:
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =VectorIndexer(inputCol="pca_features", \
                              outputCol="indexedFeatures", \
                              maxCategories=4).fit(dataPCA)
featureIndexer.transform(dataPCA).show(5, True)

In [None]:
dataNulosPCA = dataPCA.where(isnan(col("label")))

In [None]:
dataSN = dataPCA.subtract(dataNulosPCA)

In [None]:
trainPCA=dataSN.sample(0.8,200)

In [None]:
testPCA = dataSN.subtract(trainPCA)

In [None]:
logistic = cl.LogisticRegression(
    maxIter = cvModel.bestModel._java_obj.getMaxIter(), 
    regParam = cvModel.bestModel._java_obj.getRegParam(), 
    labelCol='indexedLabel')

In [None]:
# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
labels=labelIndexer.labels)

In [None]:
# Chain indexers and tree in a Pipeline
pipelinePCA = Pipeline(stages=[labelIndexer, featureIndexer, logistic,labelConverter])

In [None]:
# Train model. This also runs the indexers.
modelPCA = pipelinePCA.fit(trainPCA)

In [None]:
# Make predictions.
predictionsPCA = modelPCA.transform(testPCA)
# Select example rows to display.
prediccionPCA = predictionsPCA.select("pca_features","label","predictedLabel").show(5)

In [None]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictionsPCA)
accuracy

In [None]:
predictionsPCA = modelPCA.transform(dataNulosPCA)
prediccionPCA = predictionsPCA.select("predictedLabel")

In [None]:
prediccionPCA.toPandas().to_csv('/Users/joxea/OneDrive/Documentos/UEM/Segundo Curso/Proyecto de Open Data II/prediccion.csv')

In [None]:
datosNulos = pd.read_csv('/Users/joxea/OneDrive/Documentos/UEM/Segundo Curso/Proyecto de Open Data I/Proyecto/Datos/DataFrameNulos.csv')

In [None]:
datosPrediccion = pd.read_csv('/Users/joxea/OneDrive/Documentos/UEM/Segundo Curso/Proyecto de Open Data II/prediccion.csv')

In [None]:
datosNulos = datosNulos.drop('index', 1)
datosNulos = datosNulos.drop('action_type', 1)
datosNulos = datosNulos.drop('combined_shot_type', 1)
datosNulos = datosNulos.drop('loc_x', 1)
datosNulos = datosNulos.drop('loc_y', 1)
datosNulos = datosNulos.drop('minutes_remaining', 1)
datosNulos = datosNulos.drop('period', 1)
datosNulos = datosNulos.drop('playoffs', 1)
datosNulos = datosNulos.drop('season', 1)
datosNulos = datosNulos.drop('seconds_remaining', 1)
datosNulos = datosNulos.drop('shot_distance', 1)
datosNulos = datosNulos.drop('shot_made_flag', 1)
datosNulos = datosNulos.drop('shot_type', 1)
datosNulos = datosNulos.drop('shot_zone_area', 1)
datosNulos = datosNulos.drop('shot_zone_basic', 1)
datosNulos = datosNulos.drop('shot_zone_range', 1)
datosNulos = datosNulos.drop('game_date', 1)
datosNulos = datosNulos.drop('matchup', 1)
datosNulos = datosNulos.drop('opponent', 1)
datosNulos = datosNulos.drop('angulo', 1)

In [None]:
submission = pd.DataFrame(
    {
        "shot_id":datosNulos.shot_id,
        "shot_made_flag":datosPrediccion.predictedLabel
    }
)

In [None]:
df=pd.DataFrame(submission)
df.to_csv('submissionPCA.csv',header=True,index=False)

# Extracción de Características

In [None]:
labelIndexer = StringIndexer(inputCol='label', outputCol='indexedLabel').fit(data)
dataEC = labelIndexer.transform(data)

In [None]:
selector = ChiSqSelector(numTopFeatures=10, featuresCol="features",
                             outputCol="selectedFeatures", labelCol="indexedLabel")

model = selector.fit(dataEC)

print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())


In [None]:
print("ChiSqSelector output with top %d features selected" % 
selector.getNumTopFeatures())
dataEC = model.transform(dataEC)
dataEC = dataEC.drop('indexedLabel')
dataEC.show(5)

In [None]:
importantFeatures = model.selectedFeatures

In [None]:
importantFeatures

In [None]:
# Index labels, adding metadata to the label column
labelIndexer = StringIndexer(inputCol='label', outputCol='indexedLabel').fit(dataEC)
labelIndexer.transform(dataEC).show(5, True)

In [None]:
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =VectorIndexer(inputCol="selectedFeatures", \
                              outputCol="indexedFeatures", \
                              maxCategories=4).fit(dataEC)
featureIndexer.transform(dataEC).show(5, True)

In [None]:
dataNulosEC = dataEC.where(isnan(col("label")))

In [None]:
dataSN = dataEC.subtract(dataNulosEC)

In [None]:
trainEC=dataSN.sample(0.8,200)

In [None]:
testEC = dataSN.subtract(trainEC)

In [None]:
logistic = cl.LogisticRegression(
    maxIter = cvModel.bestModel._java_obj.getMaxIter(), 
    regParam = cvModel.bestModel._java_obj.getRegParam(), 
    labelCol='indexedLabel')

In [None]:
# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
labels=labelIndexer.labels)

In [None]:
# Chain indexers and tree in a Pipeline
pipelineEC = Pipeline(stages=[labelIndexer, featureIndexer, logistic,labelConverter])

In [None]:
# Train model. This also runs the indexers.
modelEC = pipelineEC.fit(trainEC)

In [None]:
# Make predictions.
predictionsEC = modelEC.transform(testEC)
# Select example rows to display.
prediccionEC = predictionsEC.select("selectedFeatures","label","predictedLabel").show(5)

In [None]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictionsEC)
accuracy

In [None]:
predictionsEC = modelEC.transform(dataNulosEC)
prediccionEC = predictionsEC.select("predictedLabel")

In [None]:
prediccionEC.toPandas().to_csv('/Users/joxea/OneDrive/Documentos/UEM/Segundo Curso/Proyecto de Open Data II/prediccion.csv')

In [None]:
datosNulos = pd.read_csv('/Users/joxea/OneDrive/Documentos/UEM/Segundo Curso/Proyecto de Open Data I/Proyecto/Datos/DataFrameNulos.csv')

In [None]:
datosPrediccion = pd.read_csv('/Users/joxea/OneDrive/Documentos/UEM/Segundo Curso/Proyecto de Open Data II/prediccion.csv')

In [None]:
datosNulos = datosNulos.drop('index', 1)
datosNulos = datosNulos.drop('action_type', 1)
datosNulos = datosNulos.drop('combined_shot_type', 1)
datosNulos = datosNulos.drop('loc_x', 1)
datosNulos = datosNulos.drop('loc_y', 1)
datosNulos = datosNulos.drop('minutes_remaining', 1)
datosNulos = datosNulos.drop('period', 1)
datosNulos = datosNulos.drop('playoffs', 1)
datosNulos = datosNulos.drop('season', 1)
datosNulos = datosNulos.drop('seconds_remaining', 1)
datosNulos = datosNulos.drop('shot_distance', 1)
datosNulos = datosNulos.drop('shot_made_flag', 1)
datosNulos = datosNulos.drop('shot_type', 1)
datosNulos = datosNulos.drop('shot_zone_area', 1)
datosNulos = datosNulos.drop('shot_zone_basic', 1)
datosNulos = datosNulos.drop('shot_zone_range', 1)
datosNulos = datosNulos.drop('game_date', 1)
datosNulos = datosNulos.drop('matchup', 1)
datosNulos = datosNulos.drop('opponent', 1)
datosNulos = datosNulos.drop('angulo', 1)

In [None]:
submission = pd.DataFrame(
    {
        "shot_id":datosNulos.shot_id,
        "shot_made_flag":datosPrediccion.predictedLabel
    }
)

In [None]:
df=pd.DataFrame(submission)
df.to_csv('submissionEC.csv',header=True,index=False)