In [2]:

import findspark
findspark.init()
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session for this notebook: local master with
# 4 threads, application name "StringOps", explicit executor/driver memory.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("StringOps")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

In [3]:

# Load the raw, comma-separated sample file. The first line is treated as the
# header and column types are inferred from the data.
df = (
    spark.read
    .option("header", "True")
    .option("inferSchema", "True")
    .option("sep", ",")
    .csv("simple_dirty_data.csv")
)

In [4]:
# Print the first rows of the freshly loaded DataFrame to inspect the raw data.
df.show()

+------+---------+---+--------+-----------+-----------+-----------+--------------------+
|sirano|     isim|yas|cinsiyet|     meslek|      sehir|aylik_gelir|            mal_mulk|
+------+---------+---+--------+-----------+-----------+-----------+--------------------+
|     1|    Cemal| 35|       E|       Isci|     Ankara|     3500.0|               araba|
|     2|   ceyda | 42|       K|      Memur|    Kayseri|     4200.0|            araba|ev|
|     3|    Timur| 30|    null|   Müzüsyen|Istanbul   |     9000.0|     araba|ev|yazlık|
|     4|   Burcu | 29|       K|Pazarlamacı|     Ankara|     4200.0|               araba|
|     5|  Yasemin| 23|       K|Pazarlamaci|      Bursa|     4800.0|               araba|
|     6|      Ali| 33|       E|      Memur|     Ankara|     4250.0|                  ev|
|     7|    Dilek| 29|       K|Pazarlamaci|   Istanbul|     7300.0|        araba|yazlık|
|     8|    Murat| 31|       E|   Müzüsyen|   Istanbul|    12000.0|araba|ev|dükkan|y...|
|     9|    Ahmet| 33

In [12]:
from pyspark.sql.functions import *

1. Concat

In [6]:
# Join "meslek" and "sehir" into one string column separated by " - ".
# concat() yields null as soon as one of its inputs is null (see the row
# whose sehir is null in the output below).
(
    df.select("meslek", "sehir")
    .withColumn(
        "meslek_sehir",
        concat(col("meslek"), lit(" - "), col("sehir")),
    )
    .show(truncate=False)
)

+-----------+-----------+------------------------+
|meslek     |sehir      |meslek_sehir            |
+-----------+-----------+------------------------+
|Isci       |Ankara     |Isci - Ankara           |
|Memur      |Kayseri    |Memur - Kayseri         |
|Müzüsyen   |Istanbul   |Müzüsyen - Istanbul     |
|Pazarlamacı|    Ankara |Pazarlamacı -     Ankara|
|Pazarlamaci|Bursa      |Pazarlamaci - Bursa     |
|Memur      |Ankara     |Memur - Ankara          |
|Pazarlamaci|Istanbul   |Pazarlamaci - Istanbul  |
|Müzüsyen   |Istanbul   |Müzüsyen - Istanbul     |
|Doktor     |Ankara     |Doktor - Ankara         |
|Berber     | Istanbul  |Berber -  Istanbul      |
|Tuhafiyeci |null       |null                    |
|Tornacı    | Ankara    |Tornacı -  Ankara       |
|memur      |Çorum      |memur - Çorum           |
|Doktor     |İzmir      |Doktor - İzmir          |
|Müzisyen   | Ankara    |Müzisyen -  Ankara      |
+-----------+-----------+------------------------+



2. Number Format

In [7]:
# Render "aylik_gelir" as text with thousands separators and two decimals.
(
    df.select("aylik_gelir")
    .withColumn("aylik_gelir_format", format_number(col("aylik_gelir"), 2))
    .show()
)

+-----------+------------------+
|aylik_gelir|aylik_gelir_format|
+-----------+------------------+
|     3500.0|          3,500.00|
|     4200.0|          4,200.00|
|     9000.0|          9,000.00|
|     4200.0|          4,200.00|
|     4800.0|          4,800.00|
|     4250.0|          4,250.00|
|     7300.0|          7,300.00|
|    12000.0|         12,000.00|
|   180000.0|        180,000.00|
|    12000.0|         12,000.00|
|        4.8|              4.80|
|     4200.0|          4,200.00|
|     3750.0|          3,750.00|
|    14250.0|         14,250.00|
|     8700.0|          8,700.00|
+-----------+------------------+



3. lower, initcap, length

In [18]:
# Demonstrate three string functions on the raw data:
#   lower   -> lower-cased copy of "meslek"
#   initcap -> "isim" with the first letter of each word capitalised
#   length  -> character count of "sehir" (padding spaces are counted too)
(
    df
    .withColumn("meslek_lower", lower(col("meslek")))
    .withColumn("isim_initcap", initcap(col("isim")))
    .withColumn("sehir_length", length(col("sehir")))
    .show(10, truncate=False)
)

+------+--------+---+--------+-----------+-----------+-----------+----------------------+------------+------------+------------+
|sirano|isim    |yas|cinsiyet|meslek     |sehir      |aylik_gelir|mal_mulk              |meslek_lower|isim_initcap|sehir_length|
+------+--------+---+--------+-----------+-----------+-----------+----------------------+------------+------------+------------+
|1     |Cemal   |35 |E       |Isci       |Ankara     |3500.0     |araba                 |isci        |Cemal       |6           |
|2     |ceyda   |42 |K       |Memur      |Kayseri    |4200.0     |araba|ev              |memur       |Ceyda       |7           |
|3     |Timur   |30 |null    |Müzüsyen   |Istanbul   |9000.0     |araba|ev|yazlık       |müzüsyen    |Timur       |11          |
|4     |Burcu   |29 |K       |Pazarlamacı|    Ankara |4200.0     |araba                 |pazarlamacı |Burcu       |10          |
|5     |Yasemin |23 |K       |Pazarlamaci|Bursa      |4800.0     |araba                 |pazarlam

4. rtrim, ltrim, trim

In [20]:
# Compare the three trimming variants on "sehir":
# rtrim strips trailing spaces, ltrim leading spaces, trim both sides.
df_trim = (
    df
    .withColumn("sehir_rtrim", rtrim(col("sehir")))
    .withColumn("sehir_ltrim", ltrim(col("sehir")))
    .withColumn("sehir_trim", trim(col("sehir")))
)

df_trim.show(n=5, truncate=False)

+------+-------+---+--------+-----------+-----------+-----------+---------------+-----------+-----------+----------+
|sirano|isim   |yas|cinsiyet|meslek     |sehir      |aylik_gelir|mal_mulk       |sehir_rtrim|sehir_ltrim|sehir_trim|
+------+-------+---+--------+-----------+-----------+-----------+---------------+-----------+-----------+----------+
|1     |Cemal  |35 |E       |Isci       |Ankara     |3500.0     |araba          |Ankara     |Ankara     |Ankara    |
|2     |ceyda  |42 |K       |Memur      |Kayseri    |4200.0     |araba|ev       |Kayseri    |Kayseri    |Kayseri   |
|3     |Timur  |30 |null    |Müzüsyen   |Istanbul   |9000.0     |araba|ev|yazlık|Istanbul   |Istanbul   |Istanbul  |
|4     |Burcu  |29 |K       |Pazarlamacı|    Ankara |4200.0     |araba          |    Ankara |Ankara     |Ankara    |
|5     |Yasemin|23 |K       |Pazarlamaci|Bursa      |4800.0     |araba          |Bursa      |Bursa      |Bursa     |
+------+-------+---+--------+-----------+-----------+-----------

5. regexp_replace, split

In [28]:
# regexp_replace: substitute "Kay" with "GAYY" inside "sehir".
# split: break the pipe-delimited "mal_mulk" string into an array; the pipe
# is escaped because the delimiter argument is a regular expression.
(
    df
    .withColumn("sehir_ist", regexp_replace(col("sehir"), "Kay", "GAYY"))
    .withColumn("mal_mulk_split", split(col("mal_mulk"), "\\|"))
    .show(10, truncate=False)
)

+------+--------+---+--------+-----------+-----------+-----------+----------------------+-----------+---------------------------+
|sirano|isim    |yas|cinsiyet|meslek     |sehir      |aylik_gelir|mal_mulk              |sehir_ist  |mal_mulk_split             |
+------+--------+---+--------+-----------+-----------+-----------+----------------------+-----------+---------------------------+
|1     |Cemal   |35 |E       |Isci       |Ankara     |3500.0     |araba                 |Ankara     |[araba]                    |
|2     |ceyda   |42 |K       |Memur      |Kayseri    |4200.0     |araba|ev              |GAYYseri   |[araba, ev]                |
|3     |Timur   |30 |null    |Müzüsyen   |Istanbul   |9000.0     |araba|ev|yazlık       |Istanbul   |[araba, ev, yazlık]        |
|4     |Burcu   |29 |K       |Pazarlamacı|    Ankara |4200.0     |araba                 |    Ankara |[araba]                    |
|5     |Yasemin |23 |K       |Pazarlamaci|Bursa      |4800.0     |araba                 |B

In [30]:
# Show df again: the demo cells above only displayed derived DataFrames and
# never reassigned df, so it still holds the raw data.
df.show()

+------+---------+---+--------+-----------+-----------+-----------+--------------------+
|sirano|     isim|yas|cinsiyet|     meslek|      sehir|aylik_gelir|            mal_mulk|
+------+---------+---+--------+-----------+-----------+-----------+--------------------+
|     1|    Cemal| 35|       E|       Isci|     Ankara|     3500.0|               araba|
|     2|   ceyda | 42|       K|      Memur|    Kayseri|     4200.0|            araba|ev|
|     3|    Timur| 30|    null|   Müzüsyen|Istanbul   |     9000.0|     araba|ev|yazlık|
|     4|   Burcu | 29|       K|Pazarlamacı|     Ankara|     4200.0|               araba|
|     5|  Yasemin| 23|       K|Pazarlamaci|      Bursa|     4800.0|               araba|
|     6|      Ali| 33|       E|      Memur|     Ankara|     4250.0|                  ev|
|     7|    Dilek| 29|       K|Pazarlamaci|   Istanbul|     7300.0|        araba|yazlık|
|     8|    Murat| 31|       E|   Müzüsyen|   Istanbul|    12000.0|araba|ev|dükkan|y...|
|     9|    Ahmet| 33

In [31]:
from pyspark.sql import functions as F

In [54]:
# Build the cleaned DataFrame df2 from the raw df:
#   isim     -> capitalise the first letter of each word, then strip spaces
#   cinsiyet -> missing values are replaced by "U"
#   sehir    -> upper-cased and stripped; missing values become "BİLİNMİYOR"
#
# The chain is wrapped in parentheses rather than backslash continuations.
# The previous version ended with a dangling "\" that only worked because the
# following line happened to be blank; any code added below it would have
# been silently joined to this statement.
df2 = (
    df
    .withColumn("isim", F.trim(F.initcap(df.isim)))
    .withColumn(
        "cinsiyet",
        F.when(df.cinsiyet.isNull(), "U").otherwise(df.cinsiyet),
    )
    .withColumn(
        "sehir",
        F.when(df.sehir.isNull(), "BİLİNMİYOR")
        .otherwise(F.trim(F.upper(df.sehir))),
    )
)


Temizlenmiş DataFrame'i diske yazma

In [55]:
# Inspect up to 15 rows of the cleaned DataFrame before writing it out.
df2.show(15)

+------+--------+---+--------+-----------+----------+-----------+--------------------+
|sirano|    isim|yas|cinsiyet|     meslek|     sehir|aylik_gelir|            mal_mulk|
+------+--------+---+--------+-----------+----------+-----------+--------------------+
|     1|   Cemal| 35|       E|       Isci|    ANKARA|     3500.0|               araba|
|     2|   Ceyda| 42|       K|      Memur|   KAYSERI|     4200.0|            araba|ev|
|     3|   Timur| 30|       U|   Müzüsyen|  ISTANBUL|     9000.0|     araba|ev|yazlık|
|     4|   Burcu| 29|       K|Pazarlamacı|    ANKARA|     4200.0|               araba|
|     5| Yasemin| 23|       K|Pazarlamaci|     BURSA|     4800.0|               araba|
|     6|     Ali| 33|       E|      Memur|    ANKARA|     4250.0|                  ev|
|     7|   Dilek| 29|       K|Pazarlamaci|  ISTANBUL|     7300.0|        araba|yazlık|
|     8|   Murat| 31|       E|   Müzüsyen|  ISTANBUL|    12000.0|araba|ev|dükkan|y...|
|     9|   Ahmet| 33|       E|     Doktor| 

In [56]:
# Persist the cleaned DataFrame as comma-separated CSV with a header row.
# coalesce(1) reduces the data to one partition so a single part file is
# produced; "overwrite" replaces any previous output in the target directory.
# NOTE(review): the directory name says "dirth" -- presumably a typo for
# "dirty"; left unchanged because other consumers may rely on this path.
(
    df2
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("sep", ",")
    .option("header", "True")
    .csv("simple_dirth_data_clean")
)

In [57]:
# Print the column names and types of the cleaned DataFrame.
df2.printSchema()

root
 |-- sirano: integer (nullable = true)
 |-- isim: string (nullable = true)
 |-- yas: integer (nullable = true)
 |-- cinsiyet: string (nullable = true)
 |-- meslek: string (nullable = true)
 |-- sehir: string (nullable = true)
 |-- aylik_gelir: double (nullable = true)
 |-- mal_mulk: string (nullable = true)



Elle şema oluşturma

In [58]:
# Print the schema Spark inferred for the raw DataFrame, as a reference
# before defining a schema by hand.
df.printSchema()

root
 |-- sirano: integer (nullable = true)
 |-- isim: string (nullable = true)
 |-- yas: integer (nullable = true)
 |-- cinsiyet: string (nullable = true)
 |-- meslek: string (nullable = true)
 |-- sehir: string (nullable = true)
 |-- aylik_gelir: double (nullable = true)
 |-- mal_mulk: string (nullable = true)



In [79]:
# Load the semicolon-separated OnlineRetail file with a header row and
# inferred column types. Note that this rebinds `df`, which until now
# referred to the simple_dirty_data DataFrame.
#
# The chain is wrapped in parentheses rather than backslash continuations.
# The previous version ended with a dangling "\" after .csv(...) that only
# worked because the following line was blank.
df = (
    spark.read
    .option("header", "True")
    .option("inferSchema", "True")
    .option("sep", ";")
    .csv("OnlineRetail.csv")
)


In [80]:
# Replace every "," in UnitPrice with "." (presumably decimal commas in the
# source file -- the column is read as a string). The column stays a string.
df = df.withColumn(
    "UnitPrice",
    regexp_replace(col("UnitPrice"), ",", "."),
)

In [65]:
# Print the schema of the OnlineRetail DataFrame held in df.
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [81]:
# Write the OnlineRetail data back out as comma-separated CSV with a header.
# coalesce(1) reduces the data to one partition so a single part file is
# produced inside the "OnlineRetail" directory; existing output is overwritten.
(
    df
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("sep", ",")
    .option("header", "True")
    .csv("OnlineRetail")
)

In [59]:
from pyspark.sql.types import StructType, StructField,StringType,IntegerType,FloatType

In [72]:
# Hand-written schema for the OnlineRetail CSV, used instead of inferSchema.
# Every column is nullable; UnitPrice is declared as float.
manual_schema = (
    StructType()
    .add("InvoiceNo", StringType(), True)
    .add("StockCode", StringType(), True)
    .add("Description", StringType(), True)
    .add("Quantity", IntegerType(), True)
    .add("InvoiceDate", StringType(), True)
    .add("UnitPrice", FloatType(), True)
    .add("CustomerID", IntegerType(), True)
    .add("Country", StringType(), True)
)

In [87]:
# Re-read the OnlineRetail CSV written above, this time applying the
# hand-written manual_schema instead of inferring types.
# Note that this rebinds `df2`, which previously held the cleaned person data.
#
# The output *directory* is read rather than one specific part file.
# The previous version hardcoded an absolute, machine-specific path to a
# "part-00000-<uuid>-c000.csv" file; that UUID changes every time the write
# cell overwrites the directory, so the read broke on any re-run or on any
# other machine. The relative path matches the one used by the write cell.
df2 = (
    spark.read
    .option("header", "True")
    .schema(manual_schema)
    .option("sep", ",")
    .csv("OnlineRetail")
)

In [88]:
# Show the notebook's current working directory (IPython automagic for %pwd).
pwd

'C:\\Users\\mwolf\\3D Objects\\python veri bilimi\\SPARK'

In [89]:
# Preview the first 5 rows read with the manual schema.
df2.show(5)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|1.12.2010 08:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|1.12.2010 08:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|1.12.2010 08:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|1.12.2010 08:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|1.12.2010 08:26|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 5 rows



In [90]:
# Print the schema of df2 to check that the manually declared column types
# were applied.
df2.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: float (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)

