In [1]:
import pandas as pd
# Load the scraped reviews from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer='scraping.csv')
# Preview the first five rows to check which columns were read.
df.head(n=5)

Unnamed: 0,Comment,Rating
0,From last 5 years my younger brother was using...,5
1,excellent phone camera is very nice and the st...,4
2,I have been using the earlier versions of iPho...,4
3,IMPORTANT NOTICEIf you buy some apple device o...,5
4,"Well, what can I say... iPhone is awesome as e...",5


#### Lower case

In [2]:
def _lowercase_tokens(text):
    """Lower-case every whitespace-separated token and re-join with single spaces."""
    lowered = [token.lower() for token in text.split()]
    return " ".join(lowered)


# Splitting and re-joining also collapses any run of whitespace into one space.
df['Comment'] = df['Comment'].apply(_lowercase_tokens)
df['Comment'].head()

0    from last 5 years my younger brother was using...
1    excellent phone camera is very nice and the st...
2    i have been using the earlier versions of ipho...
3    important noticeif you buy some apple device o...
4    well, what can i say... iphone is awesome as e...
Name: Comment, dtype: object

#### Removing Punctuation

In [3]:
# Drop every character that is neither a word character (\w) nor whitespace (\s).
# `regex=True` must be passed explicitly: without it, newer pandas versions
# treat the pattern as a literal string and nothing would be removed. The raw
# string keeps Python from interpreting `\w` / `\s` as string escape sequences.
df['Comment'] = df['Comment'].str.replace(r'[^\w\s]', '', regex=True)
df['Comment'].head()

  df['Comment'] = df['Comment'].str.replace('[^\w\s]','')


0    from last 5 years my younger brother was using...
1    excellent phone camera is very nice and the st...
2    i have been using the earlier versions of ipho...
3    important noticeif you buy some apple device o...
4    well what can i say iphone is awesome as expec...
Name: Comment, dtype: object

#### Removal of Stop Words

In [4]:
from nltk.corpus import stopwords
# English stop-word list; `stop` stays a list so later code can still use it as one.
stop = stopwords.words('english')
# Set copy for O(1) membership checks; the filtering result is identical to
# testing against the list, just without a linear scan per token.
stop_lookup = set(stop)
# NOTE(review): punctuation was stripped in an earlier step, so contractions now
# appear without apostrophes and may no longer match list entries — confirm
# whether that is acceptable.
df['Comment'] = df['Comment'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_lookup)
)
df['Comment'].head()

0    last 5 years younger brother using iphone 4s b...
1    excellent phone camera nice stereo speakers fa...
2    using earlier versions iphones therefore great...
3    important noticeif buy apple device online spe...
4    well say iphone awesome expected added prompt ...
Name: Comment, dtype: object

#### Common word removal

In [5]:
# Flatten all comments into one token list, count each token, and keep the
# ten most frequent ones (value_counts sorts by count, descending).
all_tokens = ' '.join(df['Comment']).split()
token_counts = pd.Series(all_tokens).value_counts()
freq = token_counts.iloc[:10]
freq

phone          728
good           543
camera         390
iphone         248
flipkart       234
delivery       227
really         217
battery        217
awesome        215
performance    190
dtype: int64

In [6]:
# Words to drop: the index of the `freq` Series computed in the previous cell.
# `freq` itself is deliberately left untouched (not rebound to a list) so this
# cell can be executed again without failing; a set gives O(1) membership tests.
common_words = set(freq.index)
df['Comment'] = df['Comment'].apply(
    lambda text: " ".join(word for word in text.split() if word not in common_words)
)
df['Comment'].head()

0    last 5 years younger brother using 4s bought 7...
1    excellent nice stereo speakers fantastic gives...
2    using earlier versions iphones therefore great...
3    important noticeif buy apple device online spe...
4    well say expected added prompt great packaging...
Name: Comment, dtype: object

#### Rare words removal

In [7]:
# Recount tokens on the already-filtered comments and keep the last ten
# entries of the descending count table, i.e. ten of the least frequent tokens.
remaining_tokens = ' '.join(df['Comment']).split()
remaining_counts = pd.Series(remaining_tokens).value_counts()
freq = remaining_counts.iloc[-10:]
freq

cameraall    1
1000         1
irregular    1
trip         1
needed       1
mobileits    1
modes2       1
warm         1
ratio        1
looking2     1
dtype: int64

In [8]:
# Words to drop: the index of the `freq` Series computed in the previous cell.
# `freq` itself is deliberately left untouched (not rebound to a list) so this
# cell can be executed again without failing; a set gives O(1) membership tests.
rare_words = set(freq.index)
df['Comment'] = df['Comment'].apply(
    lambda text: " ".join(word for word in text.split() if word not in rare_words)
)
df['Comment'].head()

0    last 5 years younger brother using 4s bought 7...
1    excellent nice stereo speakers fantastic gives...
2    using earlier versions iphones therefore great...
3    important noticeif buy apple device online spe...
4    well say expected added prompt great packaging...
Name: Comment, dtype: object

#### Spelling correction

In [9]:
from textblob import TextBlob
# Preview only: the corrected text is displayed for the first five comments
# and is not assigned back to the DataFrame.
sample_comments = df['Comment'].iloc[:5]
sample_comments.apply(lambda text: str(TextBlob(text).correct()))

0    last 5 years younger brother using is bought 7...
1    excellent nice sterno speakers fantastic gives...
2    using earlier versions phone therefore great c...
3    important noticed buy apple device online spec...
4    well say expected added prompt great packing t...
Name: Comment, dtype: object

#### Tokenization

In [10]:
# Show the word tokens TextBlob extracts from the comment in the row labelled 1.
second_comment = df.loc[1, 'Comment']
TextBlob(second_comment).words

WordList(['excellent', 'nice', 'stereo', 'speakers', 'fantastic', 'gives', 'cinematic', 'experience', 'cool', 'retaina', 'display', 'blindly', 'go', 'value', 'money'])

#### Stemming

In [11]:
from nltk.stem import PorterStemmer
st = PorterStemmer()


def _stem_tokens(text):
    """Stem every whitespace-separated token with `st` and re-join with spaces."""
    return " ".join(map(st.stem, text.split()))


# Preview only: stems are displayed for the first five comments, not stored.
df['Comment'].iloc[:5].apply(_stem_tokens)

0    last 5 year younger brother use 4s bought 7 bi...
1    excel nice stereo speaker fantast give cinemat...
2    use earlier version iphon therefor great chang...
3    import noticeif buy appl devic onlin specif ph...
4    well say expect ad prompt great packag truli s...
Name: Comment, dtype: object

#### Lemmatization

In [12]:
from textblob import Word
def _lemmatize_tokens(text):
    """Lemmatize every whitespace-separated token and re-join with spaces."""
    return " ".join(Word(token).lemmatize() for token in text.split())


# Unlike the stemming preview, the lemmatized text replaces the column contents.
df['Comment'] = df['Comment'].apply(_lemmatize_tokens)
df['Comment'].head()

0    last 5 year younger brother using 4 bought 7 b...
1    excellent nice stereo speaker fantastic give c...
2    using earlier version iphones therefore great ...
3    important noticeif buy apple device online spe...
4    well say expected added prompt great packaging...
Name: Comment, dtype: object