In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=d608c5093bd98bc87ac9486a9ac97b35f93984e50ed586331e923f2077a6bf73
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
# Reuse the active Spark session if there is one, otherwise start a new one
# registered under the application name below.
spark = (
    SparkSession.builder
    .appName('crew_requirement')
    .getOrCreate()
)

In [4]:
# Load the cruise-ship CSV: column names come from the header row and column
# types are inferred by Spark.
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('/content/cruise_ship_info.csv')
)

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/content/cruise_ship_info.csv.

In [None]:
# Preview the first five rows of the cruise-ship data.
df.show(n=5)

In [None]:
# Print column names and the types produced by schema inference.
df.printSchema()

In [None]:
from pyspark.ml.feature import StringIndexer

# Encode the categorical 'Cruise_line' column as a numeric index column.
indexer = StringIndexer(
    inputCol='Cruise_line',
    outputCol="Cruise_line_index",
)

# Fit the string-to-index mapping on the full frame, then apply it to the same frame.
indexer_model = indexer.fit(df)
df_indexed = indexer_model.transform(df)

In [None]:
# Preview the first ten rows, now including the index column.
df_indexed.show(n=10)

In [None]:
# List each distinct cruise line next to the index it was assigned.
cruise_line_mapping = df_indexed.select('Cruise_line', 'Cruise_line_index').distinct()
cruise_line_mapping.show()

In [None]:
# Number of ships per cruise line.
cruise_line_counts = df_indexed.groupby("Cruise_line").count()
cruise_line_counts.show()

In [None]:
# Column names available for feature assembly.
df_indexed.columns

In [None]:
from pyspark.ml.feature import VectorAssembler

# Input columns packed, in this order, into the single 'features' vector column.
crew_feature_cols = [
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
    'Cruise_line_index',
]

assembler = VectorAssembler(inputCols=crew_feature_cols, outputCol='features')

# Add the assembled 'features' column to the indexed frame.
output = assembler.transform(df_indexed)

In [None]:
# Preview the assembled frame (20 rows is show()'s default, made explicit here).
output.show(n=20)

In [None]:
# 80/20 train/test split. A fixed seed keeps the split (and therefore the
# metrics computed below) the same across kernel restarts.
train_data, test_data = output.randomSplit([0.8, 0.2], seed=42)

# Summary statistics of the training split.
train_data.describe().show()

In [None]:
from pyspark.ml.regression import LinearRegression

# Linear regression that predicts 'crew' from the assembled 'features' vector.
crew_req = LinearRegression(
    featuresCol='features',
    labelCol='crew',
)

# Fit on the training split only.
model = crew_req.fit(train_data)



In [None]:
# Evaluate the fitted model on the training split (in-sample metrics).
result = model.evaluate(train_data)

In [None]:
# R² on the training split.
result.r2

In [None]:
# Apply the fitted model to the held-out test split.
pred = model.transform(test_data)

In [None]:
# Evaluate the fitted model on the held-out test split.
result_test = model.evaluate(test_data)

In [None]:
# R² on the test split; compare with the training R² above to spot overfitting.
result_test.r2

In [None]:
# Preview the test rows with the model output (20 rows is show()'s default).
pred.show(n=20)

In [None]:
pip install ucimlrepo

In [None]:
from ucimlrepo import fetch_ucirepo

# Download the UCI repository dataset with id 2.
adult = fetch_ucirepo(id=2)

# Features and targets (as pandas dataframes).
X, y = adult.data.features, adult.data.targets

# Print the dataset metadata, then the per-variable information.
for section in (adult.metadata, adult.variables):
    print(section)


In [None]:
# First five feature rows (five is head()'s default, made explicit here).
X.head(n=5)

In [None]:
# Read the raw adult file with default CSV options: no header row is consumed
# and no schema inference is requested.
data = spark.read.format('csv').load('/content/adult.data')

# Preview the raw rows (20 rows is show()'s default).
data.show(n=20)

In [None]:
# Column names to assign, in file order, to the header-less adult data.
labels = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'numbers',
    'marital',
    'occupation',
    'relation',
    'race',
    'gender',
    'gain',
    'loss',
    'hourlypay',
    'country',
    'income',
]

In [None]:
# Rename the columns positionally using `labels`.
# NOTE: this rebinds `df`, which earlier in the notebook held the cruise-ship data.
df = data.toDF(*labels)


In [None]:
# Preview the first five rows with the new column names.
df.show(n=5)

In [None]:
# Print the column types of the renamed adult frame.
df.printSchema()

In [None]:
from pyspark.sql.functions import col

# Replace 'age' with its integer-cast version.
age_as_integer = col('age').cast('integer')
new_df = df.withColumn('age', age_as_integer)

In [None]:
# Confirm the type of 'age' after the cast.
new_df.printSchema()

In [None]:

# Cast the remaining numeric fields to integer, one column at a time.
for column_name in ('fnlwgt', 'numbers', 'gain', 'loss', 'hourlypay'):
    integer_column = col(column_name).cast('integer')
    new_df = new_df.withColumn(column_name, integer_column)

In [None]:
# Import only the names used; a wildcard import from pyspark.sql.functions
# would shadow Python builtins such as sum, min and max.
from pyspark.sql.functions import col, count, when

# Per-column null count: when() yields a non-null value only for rows where the
# column is null, and count() counts non-null values.
null_counts = [count(when(col(c).isNull(), c)).alias(c) for c in new_df.columns]
new_df.select(null_counts).show()

In [None]:
# Distinct raw values of 'workclass' (taken from `df`, before the integer casts).
workclass_values = df.select('workclass').distinct()
workclass_values.show()

In [None]:
# Turn the " ?" placeholder (note the leading space) into real nulls.
# This rebinds `df` to the cast frame with placeholders removed.
df = new_df.replace(to_replace=" ?", value=None)

In [None]:
# Import only the names used; a wildcard import from pyspark.sql.functions
# would shadow Python builtins such as sum, min and max.
from pyspark.sql.functions import col, count, when

# Per-column null count after the " ?" placeholders were replaced with nulls.
null_counts = [count(when(col(c).isNull(), c)).alias(c) for c in df.columns]
df.select(null_counts).show()

In [None]:
# Row count per occupation value.
occupation_counts = df.groupby('occupation').count()
occupation_counts.show()

In [None]:
# Fill missing 'country' values with a fixed category (leading space matches the raw data).
df = df.na.fill(value=" United-States", subset=['country'])

In [None]:
# Fill missing 'workclass' values with a fixed category (leading space matches the raw data).
df = df.na.fill(value=" Private", subset=['workclass'])

In [None]:
# Fill missing 'occupation' values with a fixed category (leading space matches the raw data).
df = df.na.fill(value=" Prof-specialty", subset=['occupation'])

In [None]:
# Re-check the per-column null counts after the fills above.
null_count_exprs = []
for c in df.columns:
    null_count_exprs.append(count(when(col(c).isNull(), c)).alias(c))
df.select(null_count_exprs).show()

In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer , VectorAssembler



In [None]:
# Column names available for the classification pipeline.
df.columns

In [None]:
# String-valued columns that need indexing before they can be assembled.
categorical_cols = [
    'workclass', 'education', 'marital', 'occupation',
    'relation', 'race', 'gender', 'country',
]

# Columns cast to integer earlier; used as features directly.
numeric_cols = ['age', 'fnlwgt', 'numbers', 'gain', 'loss', 'hourlypay']

# Name of the target column.
label = 'income'


In [None]:
# One StringIndexer per categorical column, each writing to '<column>_index'.
indexer = []
for name in categorical_cols:
    indexer.append(
        StringIndexer(inputCol=name, outputCol=f"{name}_index", handleInvalid='keep')
    )


In [None]:
# Index the 'income' target into a numeric 'label' column.
# handleInvalid='error' (the default) is used for the label instead of 'keep':
# 'keep' reserves an extra index for unseen/null labels, giving the label column
# one more class than the data has (presumably turning the binary problem into
# a three-class fit — verify), and would silently treat a missing target as a
# class instead of failing loudly.
label_indexer = StringIndexer(inputCol = 'income', outputCol = 'label', handleInvalid = 'error')

In [None]:
# Feature vector = indexed categorical columns followed by the numeric columns.
indexed_cols = [f'{c}_index' for c in categorical_cols]
assembler = VectorAssembler(
    inputCols=indexed_cols + numeric_cols,
    outputCol='features',
)

In [None]:
# Logistic-regression classifier reading 'features' and the indexed 'label'.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='label',
)


In [None]:
# Stage order: categorical indexers, feature assembly, label indexing, classifier.
pipeline_stages = [*indexer, assembler, label_indexer, lr]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# 80/20 train/test split of the cleaned adult data. A fixed seed keeps the split
# (and therefore the reported accuracy) the same across kernel restarts.
# NOTE: rebinds train_data/test_data, which earlier held the cruise-ship splits.
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Fit every pipeline stage on the training split.
# NOTE: rebinds `model`, which earlier held the linear-regression model.
model = pipeline.fit(train_data)

In [None]:
# Run the fitted pipeline over the held-out test split.
prediction = model.transform(test_data)

In [None]:
# Side-by-side view of the true label and the predicted label.
label_vs_prediction = prediction.select('label', 'prediction')
label_vs_prediction.show()

In [None]:
# Confusion-matrix style counts: rows per (label, prediction) pair.
confusion_counts = prediction.groupby('label', 'prediction').count()
confusion_counts.show()

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator that compares 'prediction' against 'label' using the accuracy metric.
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='accuracy',
)

In [None]:
# Accuracy of the pipeline's predictions on the test split.
evaluator.evaluate(prediction)