# Data Cleaning

Importing the libraries

In [None]:
import pandas as pd
import numpy as np
# Lift pandas' display limits: None disables the row/column truncation and
# lets the output width be detected automatically.
for _display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(_display_option, None)

In [None]:
# Read the tweet workbook into a DataFrame and preview the first rows.
TWEETS_XLSX = "clean_tweets_v4_with mentions.xlsx"
df = pd.read_excel(TWEETS_XLSX)
df.head()

Unnamed: 0,created_at,user_name,location,fix_location,tweets
0,2022-10-07 20:03:04,SENDYOURNFT,"Los Angeles, CA",los angeles,i wrote an eli5 of every coin on coinbase http...
1,2022-10-07 19:27:05,SENDYOURNFT,"Los Angeles, CA",los angeles,ftx and visa to launch crypto debit card world...
2,2022-10-07 19:13:08,SENDYOURNFT,"Los Angeles, CA",los angeles,sleuth discovers satoshi’s long-lost bitcoin v...
3,2022-10-07 18:55:08,SENDYOURNFT,"Los Angeles, CA",los angeles,victims of crypto and nft fraud can take theft...
4,2022-10-07 18:52:14,SENDYOURNFT,"Los Angeles, CA",los angeles,terra luna executive arrested by south korean ...


Data Pre-Processing and Cleaning

In [None]:
!pip install langdetect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 31.8 MB/s 
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993242 sha256=0342e0a26e9f0656045257ad248095f31b35a2afd480b3072df239ff271eb782
  Stored in directory: /root/.cache/pip/wheels/c5/96/8a/f90c59ed25d75e50a8c10a1b1c2d4c402e4dacfa87f3aff36a
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [None]:
import re #For the regular expression
import string
from langdetect import detect

# Compiled patterns shared by the cleaning helpers.

# "@" followed by any run of non-whitespace characters (user mentions).
tagging_regex = re.compile(r"@\S*")
# http(s):// links and bare www. links, up to the next whitespace.
url_pattern = re.compile(r'https?://\S+|www\.\S+')
# A token that STARTS with "-" (e.g. a trailing "-AB" sign-off). The lookbehind
# requires start-of-text or whitespace before the hyphen, so hyphenated words
# such as "long-lost" are no longer cut down to "long".
signature_pattern = re.compile(r"(?<!\S)-\S*")
# "^" followed by any run of non-whitespace characters.
irregular_pattern = re.compile(r"\^\S*")
# One or more newlines plus the run of non-whitespace characters after them.
new_line_pattern = re.compile(r"\n+\S*")


#Removing emoji
def remove_emoji(inputString):
    """Return ``inputString`` reduced to its ASCII characters.

    Emoji are dropped because they are non-ASCII, but so is every other
    non-ASCII character (accented letters, curly quotes, ...).
    """
    ascii_bytes = inputString.encode('ascii', errors='ignore')
    return ascii_bytes.decode('ascii')

#Removing URL
def remove_url(string):
    """Return ``string`` with every match of ``url_pattern`` deleted."""
    without_urls = re.sub(url_pattern, '', string)
    return without_urls

#Removing Signature
def remove_signature(text):
    """Return ``text`` with every match of ``signature_pattern`` deleted."""
    without_signature = re.sub(signature_pattern, '', text)
    return without_signature

#Remove punctuations
PUNCT_TO_REMOVE = string.punctuation
# Deletion table for str.translate, built once here instead of on every call
# (the function runs once per tweet).
_PUNCT_TABLE = str.maketrans('', '', PUNCT_TO_REMOVE)

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    Characters are deleted, not replaced by a space, so "long-lost" becomes
    "longlost".
    """
    return text.translate(_PUNCT_TABLE)

# remove tags,emoji,urls,signatures and spacing between words
def clean_message(message):
    """Normalise one raw tweet to lowercase, punctuation-free ASCII text.

    Steps, in order: strip @-mentions, drop non-ASCII characters, strip URLs,
    strip signature tokens and ^-prefixed tokens, strip punctuation, collapse
    every whitespace run (spaces, tabs, line breaks) to one space, lowercase.

    Args:
        message: Raw tweet text (str).

    Returns:
        The cleaned text. An empty string is returned when the cleaned text is
        longer than 15 characters and langdetect either classifies it as
        non-English or cannot classify it at all. Texts of 15 characters or
        fewer skip the language check and are returned as they are.
    """
    # Function-scope import so this cell does not rely on another cell's
    # imports; langdetect is already loaded, so the repeated import is cheap.
    from langdetect.lang_detect_exception import LangDetectException

    message = re.sub(tagging_regex, '', message)
    message = remove_emoji(message)
    message = remove_url(message)
    message = remove_signature(message)
    message = irregular_pattern.sub(r'', message)
    message = remove_punctuation(message)
    # Line breaks are handled by this whitespace collapse, so the word that
    # follows a newline is kept (substituting new_line_pattern here would
    # delete it together with the newline). split() also trims both ends.
    message = ' '.join(message.split())
    message = message.lower() # making all tweets to lower case

    # Very short texts are not language-checked.
    if len(message) > 15:
        # NOTE(review): per the langdetect README, detection is not
        # deterministic unless DetectorFactory.seed is set before first use.
        try:
            language = detect(message)
        except LangDetectException:
            # langdetect found nothing to classify (e.g. a long digit-only
            # string); treat it like non-English instead of aborting the run.
            return ""
        if language != 'en':
            return ""

    return message


In [None]:
# Cast every tweet to str (missing values become the text "nan"), store that
# back, then derive the cleaned column from it.
tweets_as_text = df["tweets"].apply(str)
df["tweets"] = tweets_as_text
df["cleaned_tweets"] = tweets_as_text.apply(clean_message)

In [None]:
# Keep everything except the raw tweet text.
df1 = df.drop(columns=['tweets'])

In [None]:
# replacing blank places with nan values
# The result is assigned back to the column: replace(..., inplace=True) on
# df1['cleaned_tweets'] acts on an intermediate object and does not update
# df1 when pandas Copy-on-Write is active.
df1['cleaned_tweets'] = df1['cleaned_tweets'].replace('', np.nan)

In [None]:
# Keep only rows that still have cleaned text. .copy() makes df2 an
# independent frame, so later assignments to it do not raise
# SettingWithCopyWarning.
df2 = df1[df1['cleaned_tweets'].notna()].copy()

In [None]:
#Removing numbers in cleaned-tweets column
# Only the tweet text of digit-only rows is blanked (with a real NaN); the
# other columns of those rows keep their values and dtypes.
# .eq(True) turns a missing isdigit() result into False, so the cell can be
# re-run after some tweets have already been blanked.
is_digits_only = df2["cleaned_tweets"].str.isdigit().eq(True)
# assign() returns a new frame instead of writing into a filtered slice.
df2 = df2.assign(cleaned_tweets=df2["cleaned_tweets"].mask(is_digits_only))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [None]:
# Turn the literal placeholders "NaN" / "nan" in the tweet text into real
# missing values. Only cleaned_tweets is touched, so an identical string in
# another column (e.g. a user name) is not wiped. assign() returns a new
# frame instead of writing into a filtered slice.
df2 = df2.assign(
    cleaned_tweets=df2["cleaned_tweets"].replace(["NaN", "nan"], np.nan)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Drop the rows whose cleaned text is missing.
df3 = df2.dropna(subset=['cleaned_tweets'])

In [None]:
# Missing-value count per column of the final frame.
df3.isnull().sum()

created_at          0
user_name           0
location          377
fix_location        0
cleaned_tweets      0
dtype: int64

In [None]:
# Display-only check of the distinct cleaned tweets in sorted order.
# The result is not assigned, so df3 itself keeps its duplicate rows.
distinct_tweets = df3['cleaned_tweets'].drop_duplicates()
distinct_tweets.sort_values()

Output hidden; open in https://colab.research.google.com to view.

In [None]:
#Export clean file
# The DataFrame index is written too, as the first (unnamed) CSV column.
OUTPUT_CSV = 'cleantweets_30oct_2.csv'
df3.to_csv(OUTPUT_CSV)