In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Smoke test: build a 1000-row DataFrame with a single "hello" column.
df = spark.createDataFrame([{"hello": "world"} for _ in range(1000)])

In [3]:
# Long form of the JSON reader: choose the format explicitly, then load the path.
df = (
    spark.read
    .format("json")
    .load("dbfs:/FileStore/shared_uploads/africadataschool@outlook.com/people-2.json")
)

In [4]:
# Shorthand for the same read: DataFrameReader.json() on the same file.
df2 = spark.read.json(
    "dbfs:/FileStore/shared_uploads/africadataschool@outlook.com/people-2.json"
)

In [5]:
# Print the first rows of the people DataFrame as a text table.
df.show()

In [6]:
# Print the column names and types Spark inferred from the JSON file.
df.printSchema()

In [7]:
# List the column names.
df.columns 

In [8]:
# describe() only builds a summary DataFrame; the bare expression echoes its repr,
# not the statistics themselves (the next cell calls show() to print them).
df.describe()

In [9]:
# Print the summary statistics computed by describe().
df.describe().show()

In [10]:
# When the inferred schema is not what you need, define the schema explicitly

from pyspark.sql.types import StructField,StringType,IntegerType,StructType

In [11]:
# Explicit schema fields: (column name, type, nullable).
data_schema = [
    StructField('age', IntegerType(), True),
    StructField('name', StringType(), True),
]

In [12]:
# Wrap the field list in a StructType so it can be passed to the reader as a schema.
final_struc = StructType(data_schema)

In [13]:
# Re-read the JSON file, this time applying the explicit schema instead of inferring one.
df = (
    spark.read
    .schema(final_struc)
    .json("dbfs:/FileStore/shared_uploads/africadataschool@outlook.com/people-2.json")
)

In [14]:
# Confirm the DataFrame now carries the explicit schema (age, name).
df.printSchema()

In [15]:
# Selecting a column: in pandas this would be df['age']; in Spark, select() returns
# a new DataFrame containing just that column.
df.select(df.age).show()

In [16]:
# Fetch the first two rows to the driver as a list of Row objects.
df.take(2)

In [17]:
# Select several columns at once.
df.select('age', 'name').show()

In [19]:
# Add a derived column. show() returns None, so keep the new DataFrame in df2
# first and display it in a separate statement.
df2 = df.withColumn('newage', df['age'] * 2)
df2.show()

In [20]:
# Rename a column; this returns a new DataFrame and leaves df itself unchanged.
df.withColumnRenamed(existing='age', new='agenew').show()

In [21]:
# To query the DataFrame with SQL, register it as a temporary view named "people".
df.createOrReplaceTempView("people")

In [22]:
# Run a SQL query against the "people" view; the result is itself a DataFrame.
results=spark.sql("SELECT * FROM people")
results.show()

In [23]:
# Use SQL to select only the rows where age equals 30.
new_results=spark.sql("SELECT * FROM people WHERE age=30")

In [24]:
# Print the rows returned by the age filter query.
new_results.show() 

In [25]:
# Switch to a new dataset: the appl_stock CSV. This rebinds df.
# NOTE(review): no header/inferSchema options are passed here (unlike the other CSV
# loads in this notebook), so columns get the default names _c0, _c1, ... and a
# header row, if the file has one, is read as data - confirm this is intended.
df = spark.read.csv(
    "dbfs:/FileStore/shared_uploads/africadataschool@outlook.com/appl_stock-1.csv"
)

In [26]:
# Print the schema of the stock DataFrame.
df.printSchema()

In [27]:
# Print the first rows of the stock DataFrame.
df.show()

In [28]:
# Filter rows with a SQL-style condition string (where() is an alias of filter()).
# NOTE(review): _c0 is the auto-generated name of the first column because the CSV
# was loaded without header=True - confirm this is the column meant to be compared.
df.where("_c0 < 500").show()

In [29]:
#df.filter(df['_c1']<500).select('Volume').show()

In [30]:
# List the column names of the stock DataFrame.
df.columns 

In [31]:
# Sales dataset: use the first row as the header and let Spark infer column types.
salesinfo = spark.read.csv(
    "dbfs:/FileStore/shared_uploads/africadataschool@outlook.com/sales_info-1.csv",
    header=True,
    inferSchema=True,
)

In [32]:
# Print the first rows of the sales data.
salesinfo.show()

In [33]:
# Print the inferred schema of the sales data.
salesinfo.printSchema()

In [34]:
# Average of every numeric column, per company (avg() and mean() are aliases).
company = salesinfo.groupBy('Company').avg()

In [35]:
# Print the per-company averages.
company.show()

In [36]:
# Total of the Sales column over the whole DataFrame (an aggregation with no grouping keys).
salesinfo.groupBy().agg({'Sales': 'sum'}).show()

In [37]:
# Group the sales data by company. The grouping must use `salesinfo`: at this point
# `df` holds the stock CSV, which was loaded without a header and therefore has
# no "Company" column (its columns are the default _c0, _c1, ... names).
group_data = salesinfo.groupBy("Company")

In [38]:
from pyspark.sql.functions import countDistinct,avg,stddev

In [39]:
# Count the distinct values in the Sales column.
salesinfo.select(countDistinct('Sales')).show()

In [40]:
# Average of the Sales column.
salesinfo.select(avg('Sales')).show()

In [41]:
# Standard deviation of the Sales column.
salesinfo.select(stddev('Sales')).show()

In [42]:
# Sort by Sales, largest first.
salesinfo.orderBy('Sales', ascending=False).show()

In [43]:
# Missing data: load a CSV with a header row and inferred column types.
dfmissing = spark.read.csv(
    "dbfs:/FileStore/shared_uploads/africadataschool@outlook.com/ContainsNull.csv",
    header=True,
    inferSchema=True,
)
# df1 is a second name bound to the same DataFrame.
df1 = dfmissing

In [44]:
# Print the raw rows, including any nulls.
dfmissing.show()

In [45]:
# Keep only rows that have at least 2 non-null values (dropna() is the DataFrame-level
# form of na.drop()).
dfmissing.dropna(thresh=2).show()

In [46]:
#dfmissing.na.drop(how='any').show()

In [47]:
#df.na.fill(0).show()
#df.na.fill("No name",subset=['Name']).show()

In [48]:
# Machine learning: linear regression
from  pyspark.ml.regression import LinearRegression

In [49]:
# E-commerce customers dataset: header row present, column types inferred.
dfecomerce = spark.read.csv(
    "dbfs:/FileStore/shared_uploads/africadataschool@outlook.com/Ecommerce_Customers.csv",
    header=True,
    inferSchema=True,
)



In [50]:
# Print the first rows of the customers data.
dfecomerce.show()

In [51]:
# Print the inferred schema of the customers data.
dfecomerce.printSchema()

In [52]:
from pyspark.ml.linalg import Vectors 
from pyspark.ml.feature import  VectorAssembler

In [53]:
# List the column names to pick the model's input features from.
dfecomerce.columns

In [54]:
# Combine the four input columns into a single vector column named "features".
assembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='features',
)

In [55]:
# Apply the assembler: the result is the customers data plus the "features" column.
output=assembler.transform(dfecomerce)


In [56]:
# Print the assembled feature vectors.
output.select('features').show()

In [57]:
# Fetch the first assembled row to the driver as a one-element list of Row objects.
output.take(1)

In [58]:
# Keep only what the model needs: the feature vector and the label column.
final_data = output.select(['features', 'Yearly Amount Spent'])

In [59]:
# Print the modelling DataFrame (features + label).
final_data.show()

In [60]:
# 70/30 train/test split. A fixed seed keeps the split, and therefore the metrics
# computed from it below, the same across reruns of the notebook.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [61]:
# Print summary statistics for the training split.
train_data.describe().show()

In [62]:
# Linear regression estimator: inputs come from "features" (the default, spelled out
# here), and the target is the yearly amount spent.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)

In [63]:
# Fit the regression on the training split.
lr_model=lr.fit(train_data)

In [64]:
# Evaluate the fitted model on the held-out test split; the returned summary
# exposes the residuals, RMSE and r2 inspected in the following cells.
test_results=lr_model.evaluate(test_data)

In [65]:
# Print the residuals on the test split.
test_results.residuals.show()

In [66]:
# Root mean squared error on the test split.
test_results.rootMeanSquaredError

In [67]:
# Summary statistics of the label, for judging the RMSE against its scale.
final_data.describe().show()

In [68]:
# R-squared on the test split.
test_results.r2

In [69]:
# Keep only the feature vectors, dropping the label, to stand in for unlabeled data.
unlabeled_data=test_data.select('features')

In [70]:
# Score the unlabeled feature vectors with the fitted model.
predictions= lr_model.transform(unlabeled_data)

In [71]:
# Print the model's output for the unlabeled rows.
predictions.show()