In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
#Tokenizer
from pyspark.ml.feature import Tokenizer, StopWordsRemover

# Create the Spark session for this analysis, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("TweetAnalysis")
    .getOrCreate()
)

# Load the raw tweets from HDFS. The file has no header row, so Spark assigns
# the default column names (_c0, _c1, ...); column types are inferred from the data.
df = (
    spark.read
    .option("header", False)
    .option("inferSchema", True)
    .csv("hdfs:///hdfs/tweet/ProjectTweets.csv")
)



                                                                                

In [6]:
# Preview the first rows of the raw DataFrame; long string values are truncated in the output
df.show()


+---+----------+--------------------+--------+---------------+--------------------+
|_c0|       _c1|                 _c2|     _c3|            _c4|                 _c5|
+---+----------+--------------------+--------+---------------+--------------------+
|  0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|  1|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|  2|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|  3|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|  4|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
|  5|1467811372|Mon Apr 06 22:20:...|NO_QUERY|       joy_wolf|@Kwesidei not the...|
|  6|1467811592|Mon Apr 06 22:20:...|NO_QUERY|        mybirch|         Need a hug |
|  7|1467811594|Mon Apr 06 22:20:...|NO_QUERY|           coZZ|@LOLTrish hey  lo...|
|  8|1467811795|Mon Apr 06 22:20:...|NO_QUERY|2Hood4Hollywood|@Tatiana_K nop

In [7]:
# Print the schema of the raw DataFrame: column names, inferred types and nullability
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: long (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)



In [8]:
# Replace Spark's auto-generated column names (_c0 ... _c5) with descriptive ones
df = (
    df
    .withColumnRenamed("_c0", "number")
    .withColumnRenamed("_c1", "ids")
    .withColumnRenamed("_c2", "date")
    .withColumnRenamed("_c3", "flag")
    .withColumnRenamed("_c4", "user")
    .withColumnRenamed("_c5", "text")
)

In [9]:
# Print the schema again to confirm that the columns now carry the new names
df.printSchema()

root
 |-- number: integer (nullable = true)
 |-- ids: long (nullable = true)
 |-- date: string (nullable = true)
 |-- flag: string (nullable = true)
 |-- user: string (nullable = true)
 |-- text: string (nullable = true)



In [11]:
# Display the tweet text without truncation so the complete content of each row is visible
df.select(df["text"]).show(n=20, truncate=False)

+---------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                 |
+---------------------------------------------------------------------------------------------------------------------+
|@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  |
|is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!      |
|@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds                            |
|my whole body feels itchy and like its on fire                                                                       |
|@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.       |
|@Kwesidei not the whole crew           

### Comments;
The text contains unnecessary punctuation marks, hashtags, and stop words.

In [13]:
# Keep only the two columns used in the rest of the analysis: the timestamp and the tweet text
data = df.select(["date", "text"])
data.show()

+--------------------+--------------------+
|                date|                text|
+--------------------+--------------------+
|Mon Apr 06 22:19:...|@switchfoot http:...|
|Mon Apr 06 22:19:...|is upset that he ...|
|Mon Apr 06 22:19:...|@Kenichan I dived...|
|Mon Apr 06 22:19:...|my whole body fee...|
|Mon Apr 06 22:19:...|@nationwideclass ...|
|Mon Apr 06 22:20:...|@Kwesidei not the...|
|Mon Apr 06 22:20:...|         Need a hug |
|Mon Apr 06 22:20:...|@LOLTrish hey  lo...|
|Mon Apr 06 22:20:...|@Tatiana_K nope t...|
|Mon Apr 06 22:20:...|@twittera que me ...|
|Mon Apr 06 22:20:...|spring break in p...|
|Mon Apr 06 22:20:...|I just re-pierced...|
|Mon Apr 06 22:20:...|@caregiving I cou...|
|Mon Apr 06 22:20:...|@octolinz16 It it...|
|Mon Apr 06 22:20:...|@smarrison i woul...|
|Mon Apr 06 22:20:...|@iamjazzyfizzle I...|
|Mon Apr 06 22:20:...|Hollis' death sce...|
|Mon Apr 06 22:20:...|about to file taxes |
|Mon Apr 06 22:20:...|@LettyA ahh ive a...|
|Mon Apr 06 22:20:...|@FakerPatt

## Data Cleaning

In [14]:
# Normalise the tweet text to lowercase so the same word always has the same spelling
data = data.withColumn(
    "text",
    lower(data["text"]),
)
data.show()




+--------------------+--------------------+
|                date|                text|
+--------------------+--------------------+
|Mon Apr 06 22:19:...|@switchfoot http:...|
|Mon Apr 06 22:19:...|is upset that he ...|
|Mon Apr 06 22:19:...|@kenichan i dived...|
|Mon Apr 06 22:19:...|my whole body fee...|
|Mon Apr 06 22:19:...|@nationwideclass ...|
|Mon Apr 06 22:20:...|@kwesidei not the...|
|Mon Apr 06 22:20:...|         need a hug |
|Mon Apr 06 22:20:...|@loltrish hey  lo...|
|Mon Apr 06 22:20:...|@tatiana_k nope t...|
|Mon Apr 06 22:20:...|@twittera que me ...|
|Mon Apr 06 22:20:...|spring break in p...|
|Mon Apr 06 22:20:...|i just re-pierced...|
|Mon Apr 06 22:20:...|@caregiving i cou...|
|Mon Apr 06 22:20:...|@octolinz16 it it...|
|Mon Apr 06 22:20:...|@smarrison i woul...|
|Mon Apr 06 22:20:...|@iamjazzyfizzle i...|
|Mon Apr 06 22:20:...|hollis' death sce...|
|Mon Apr 06 22:20:...|about to file taxes |
|Mon Apr 06 22:20:...|@lettya ahh ive a...|
|Mon Apr 06 22:20:...|@fakerpatt

## Comments;
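Uppercase and lowercase letters have different character codes, so the same word written with different capitalisation (e.g. "Hello" and "hello") would be treated as two different words. All text was converted to lowercase to prevent this, so that identical expressions produce identical values.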

In [15]:
# Remove special symbols: every character that is neither a word character (\w)
# nor whitespace (\s) is deleted from the text.
# NOTE(review): URLs and @mentions are not dropped, only their punctuation is, so a
# link such as "http://twitpic.com/..." survives as one glued token - confirm this is intended.
data = data.withColumn(
    "text",
    regexp_replace(data["text"], r'[^\w\s]', ''),
)
# Display the cleaned text without truncating long values
data.show(truncate=False)

+--------------------+--------------------+
|                date|                text|
+--------------------+--------------------+
|Mon Apr 06 22:19:...|switchfoot httptw...|
|Mon Apr 06 22:19:...|is upset that he ...|
|Mon Apr 06 22:19:...|kenichan i dived ...|
|Mon Apr 06 22:19:...|my whole body fee...|
|Mon Apr 06 22:19:...|nationwideclass n...|
|Mon Apr 06 22:20:...|kwesidei not the ...|
|Mon Apr 06 22:20:...|         need a hug |
|Mon Apr 06 22:20:...|loltrish hey  lon...|
|Mon Apr 06 22:20:...|tatiana_k nope th...|
|Mon Apr 06 22:20:...|twittera que me m...|
|Mon Apr 06 22:20:...|spring break in p...|
|Mon Apr 06 22:20:...|i just repierced ...|
|Mon Apr 06 22:20:...|caregiving i coul...|
|Mon Apr 06 22:20:...|octolinz16 it it ...|
|Mon Apr 06 22:20:...|smarrison i would...|
|Mon Apr 06 22:20:...|iamjazzyfizzle i ...|
|Mon Apr 06 22:20:...|hollis death scen...|
|Mon Apr 06 22:20:...|about to file taxes |
|Mon Apr 06 22:20:...|lettya ahh ive al...|
|Mon Apr 06 22:20:...|fakerpatty

## Comments;
Special symbols and unnecessary characters were removed to improve model performance and prevent analysis inconsistency.

In [19]:
# Collapse every run of whitespace (spaces, tabs, newlines) into a single space,
# then trim the single space that may remain at either end of the text.
# Why the trim: the Tokenizer used later splits on individual whitespace characters,
# so a tweet that still starts with a space would yield an empty-string token.
# The trim must come after the collapse, because by then all whitespace is a plain space.
data = data.withColumn(
    "text",
    trim(regexp_replace(col("text"), r'\s+', ' ')),
)

## Comments;
Runs of multiple whitespace characters were replaced with a single space to improve model performance and notebook performance.

In [20]:
from pyspark.ml.feature import Tokenizer
# Tokenization: split the cleaned "text" column into an array of words ("words")
tokenizer = Tokenizer(inputCol="text", outputCol="words")
# Drop any "words" column left over from a previous run of this cell first;
# otherwise re-running the cell fails because the output column already exists.
# drop() is a no-op when the column is absent, so the first run is unchanged.
data = tokenizer.transform(data.drop("words"))

# Show the original text next to its tokens
data.select("text", "words").show()

+--------------------+--------------------+
|                text|               words|
+--------------------+--------------------+
|switchfoot httptw...|[switchfoot, http...|
|is upset that he ...|[is, upset, that,...|
|kenichan i dived ...|[kenichan, i, div...|
|my whole body fee...|[my, whole, body,...|
|nationwideclass n...|[nationwideclass,...|
|kwesidei not the ...|[kwesidei, not, t...|
|         need a hug |      [need, a, hug]|
|loltrish hey long...|[loltrish, hey, l...|
|tatiana_k nope th...|[tatiana_k, nope,...|
|twittera que me m...|[twittera, que, m...|
|spring break in p...|[spring, break, i...|
|i just repierced ...|[i, just, repierc...|
|caregiving i coul...|[caregiving, i, c...|
|octolinz16 it it ...|[octolinz16, it, ...|
|smarrison i would...|[smarrison, i, wo...|
|iamjazzyfizzle i ...|[iamjazzyfizzle, ...|
|hollis death scen...|[hollis, death, s...|
|about to file taxes |[about, to, file,...|
|lettya ahh ive al...|[lettya, ahh, ive...|
|fakerpattypattz o...|[fakerpatt

## Comments;
Tokenization separates the text of each tweet into individual words so that further analysis can be performed on those words.

In [21]:
from pyspark.ml.feature import StopWordsRemover
# Remove stop words: filter the "words" array into a new "filtered_words" column
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
# Drop any "filtered_words" column left over from a previous run of this cell first;
# otherwise re-running the cell fails because the output column already exists.
# drop() is a no-op when the column is absent, so the first run is unchanged.
data = remover.transform(data.drop("filtered_words"))

In [22]:
# Preview the first five rows to compare the tokens before and after stop-word removal
data.show(5)

+--------------------+--------------------+--------------------+--------------------+
|                date|                text|               words|      filtered_words|
+--------------------+--------------------+--------------------+--------------------+
|Mon Apr 06 22:19:...|switchfoot httptw...|[switchfoot, http...|[switchfoot, http...|
|Mon Apr 06 22:19:...|is upset that he ...|[is, upset, that,...|[upset, cant, upd...|
|Mon Apr 06 22:19:...|kenichan i dived ...|[kenichan, i, div...|[kenichan, dived,...|
|Mon Apr 06 22:19:...|my whole body fee...|[my, whole, body,...|[whole, body, fee...|
|Mon Apr 06 22:19:...|nationwideclass n...|[nationwideclass,...|[nationwideclass,...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



## Comments;
Stop words (very common words that carry little meaning on their own, such as "is" and "that") were removed. This should improve both model performance and notebook performance.