# **`nlp basics`**

In [None]:
target_string = "Emma is a basketball player who was born on June 17, 1993. She played 112 matches with scoring average 26.12 points per game. Her weight is 51 kg."

In [None]:
target_string

'Emma is a basketball player who was born on June 17, 1993. She played 112 matches with scoring average 26.12 points per game. Her weight is 51 kg.'

In [None]:
import re 


In [None]:
re.findall(r"\d+",target_string)

['17', '1993', '112', '26', '12', '51']

In [None]:
re.findall(r"\d+",target_string)

['17', '1993', '112', '26', '12', '51']

In [None]:
re.search(r"\d+",target_string).

<re.Match object; span=(49, 51), match='17'>

In [None]:
re.finditer(r"\d+",target_string)

<callable_iterator at 0x7f87471a6910>

In [None]:
target_string = "Jessa is a Python developer. She also gives Python programming training"

In [None]:
re.findall(r'\b[Pp]\w+',target_string)

['Python', 'Python', 'programming']

In [None]:
re.findall(r'\b[Pp]\w+[Gg]',target_string)

['programming']

In [None]:
re.findall(r'\b[Ppt]\w+[g]',target_string)

['programming', 'training']

In [None]:
re.findall(r'\b[Aa]\w+|\w+[aA]\b',target_string)

['Jessa', 'also']

In [None]:
ss="1234567890"
re.findall(r"\d{10}",ss)

['1234567890']

In [None]:
ss="(123)-567-8912"
re.findall(r"\(\d{3}\)-\d{3}-\d{4}",ss)

['(123)-567-8912']

In [None]:
ss="+91 1234567890"
re.findall(r"\+91\s\d{10}",ss)

['+91 1234567890']

In [None]:
st="Jhohn has 3 dogs"
info=re.findall(r'([A-za-z])+ (\w+) (\d) (\w+)',st)

In [None]:
list(zip(*info))

[('n',), ('has',), ('3',), ('dogs',)]

In [None]:
target_string = "My name is maximums and my luck numbers are 12 45 78"
re.split(r'\s+',target_string)

['My',
 'name',
 'is',
 'maximums',
 'and',
 'my',
 'luck',
 'numbers',
 'are',
 '12',
 '45',
 '78']

In [None]:
target_string = "12, and45,78and85-17and89-97"
re.findall(r'\d+',target_string)

['12', '45', '78', '85', '17', '89', '97']

In [None]:
re.split(r'[\b\D\b]+',target_string)

['12', '45', '78', '85', '17', '89', '97']

In [None]:
tt="12-45-74."
re.split(r'(\D+)',tt)

['12', '-', '45', '-', '74', '.', '']

In [None]:
tt="      Jessa know Ml and testinf"
re.sub(r'\s','',tt)

'JessaknowMlandtestinf'

In [None]:
tt="      Jessa know Ml and testinf"
re.sub(r'\s','_',tt,count=5,flags=re.I)

'_____ Jessa know Ml and testinf'

In [None]:
tt="      Jessa know Ml and      testinf"
re.sub('^\s+'," ",tt)

'_Jessa_know_Ml_and      testing'

In [None]:
string='''Follow our leader Elon musk on twitter here:
 https://twitter.com/elonmusk, more information on Tesla's 
 products can be found at https://www.tesla.com/. 
 Also here are leading influencers for tesla related news,
 https://twitter.com/teslaratihttps://twitter.com/dummy_teslahttps://twitter.com/dummy_2_tesla'''

In [None]:
import spacy
nlp=spacy.load("en_core_web_sm")

In [None]:
doc=nlp("you only live once,Heloo world. I am going to N.Y don't.")
for i in doc:
  print(i.text,end=",")

you,only,live,once,,,Heloo,world,.,I,am,going,to,N.Y,do,n't,.,

In [None]:
#blank tokenizer

In [None]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
nlp=English()

tokenizer=Tokenizer(nlp.vocab)
tokens=tokenizer("you only live once,Heloo world. I am going to N.Y don't.")
print("Blank tokenizer",end=":")
for i in tokens:
  print(i,end=",")

Blank tokenizer:you,only,live,once,Heloo,world.,I,am,going,to,N.Y,don't.,

In [None]:
from spacy.symbols import ORTH
doc=nlp("Viratkohli is the best ,Viratkholi is a bats man,Viratkholi is the captain")
print("Noramal",end=":")
for i in doc:
  print(i,end=',')

special=[{ORTH:"Virat"},{ORTH:'kohli'}]
nlp.tokenizer.add_special_case("Viratkohli",special)
doc1=nlp("Viratkohli is the best,Viratkohli is a bats man,Viratkohli is the captain")
print("\n Special",end=":")
for i in doc1:
  print(i,end=",")

Noramal:Virat,kohli,is,the,best,,,Virat,kholi,is,a,bats,man,,,Viratkholi,is,the,captain,
 Special:Virat,kohli,is,the,best,,,Viratkohli,is,a,bats,man,,,Viratkohli,is,the,captain,

In [None]:
special=[{ORTH:"Give"},{ORTH:"me"}]
nlp.tokenizer.add_special_case("Gimme",special)
doc=nlp("Gimme that")
for i in doc:
  print(i)


Give
me
that


In [None]:
nlp=English()
text="Let's move to L.A"
doc=nlp(text)
tok=nlp.tokenizer.explain(text)

for i in tok:
  print(i[1],"\t",i[0])

Let 	 SPECIAL-1
's 	 SPECIAL-2
move 	 TOKEN
to 	 TOKEN
L.A 	 TOKEN


In [None]:
doc=nlp("hai ola 5km ride cost $10.000 lakdj@sss.com")
for i in doc:
  print(i)

hai
ola
5
km
ride
cost
$
10.000
lakdj@sss.com


In [None]:
len(doc)

9

In [None]:
doc[-4:]

cost $10.000 lakdj@sss.com

In [None]:
doc=nlp(u'I am disco dancer')
for i in doc.ents:
  print(i.text+'-'+i.label_+'-'+str(spacy.explain(i.label_)))

disco dancer-PERSON-People, including fictional


In [None]:
doc=nlp("orange is good for health")
for i in doc.ents:
  print(i.text+'-'+i.label_+'-'+str(spacy.explain(i.label_)))

orange-ORG-Companies, agencies, institutions, etc.


In [None]:
doc=nlp(u"autonomous cars shift insuarance liabilities towards managers")
for chuck in doc.noun_chunks:
  print(chuck.text)

autonomous cars
insuarance liabilities
managers


In [None]:
from spacy import displacy
displacy.render(doc,style='dep',jupyter=True,options={display:110})

In [None]:
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')
displacy.render(doc, style='ent', jupyter=True)

In [None]:
text="I like bananabread breadbananabread"
print([t.text for t in nlp(text)])

['I', 'like', 'banana', 'bread', 'breadbananabread']


In [None]:
prefix=("banana",)+nlp.Defaults.prefixes

In [None]:
prefix_regex=spacy.util.compile_prefix_regex(prefix)

In [None]:
nlp.tokenizer.prefix_search=prefix_regex.search

In [None]:
print([t.text for t in nlp(text)])

['I', 'like', 'banana', 'bread', 'breadbanana']


In [None]:
suffixes=("banana",)+nlp.Defaults.suffixes
suffixes_regex=spacy.util.compile_suffix_regex(prefix)
nlp.tokenizer.suffix_search=suffixes_regex.search
print([t.text for t in nlp(text)])

['I', 'like', 'banana', 'bread', 'bread', 'banana']


In [None]:
suffixes=("banana",)+nlp.Defaults.infixes
suffixes_regex=spacy.util.compile_infix_regex(suffixes)
nlp.tokenizer.infix_finditer=suffixes_regex.finditer
print([t.text for t in nlp(text)])

['I', 'like', 'banana', 'bread', 'bread', 'banana', 'bread']


In [None]:
from nltk.stem import PorterStemmer ,SnowballStemmer ,LancasterStemmer,RegexpStemmer
stem=PorterStemmer()
list=['generous','generate','generously','generation']
print("PorterStemmer")
for i in list:
  print(stem.stem(i),end=',')

snow=SnowballStemmer(language='english')
print("\nSnowball Stemmer")
for i in list:
  print(snow.stem(i),end=',')


PorterStemmer
gener,gener,gener,gener,
Snowball Stemmer
generous,generat,generous,generat,

In [None]:
lancaster=LancasterStemmer()
words=['eat','ate','eaten','eating']
print("LancaseterStemmer")
for i in words:
  print(lancaster.stem(i),end=',')


print("\nSnowball Stemmer")

for i in words:
  print(snow.stem(i),end=',')

print("\nPorterStemmer")
for i in words:
  print(stem.stem(i),end=',')

print("\nRegexStemmer")
regex=RegexpStemmer('ing$|e$\able$|ed$')
wording=['advisable','counting','hosted']
for i in wording:
  print(regex.stem(i),end=',')

LancaseterStemmer
eat,at,eat,eat,
Snowball Stemmer
eat,ate,eaten,eat,
PorterStemmer
eat,ate,eaten,eat,
RegexStemmer
advisable,count,host,

# **`Gensim`**

In [None]:
from gensim.parsing.preprocessing import remove_stopwords

In [None]:
text='Nick likes to play football,however he is not too font of tennis '

In [None]:
filtered_sentence=remove_stopwords(text)

In [None]:
filtered_sentence

'Nick likes play football,however font tennis'

In [None]:
import gensim
all_stop=gensim.parsing.preprocessing.STOPWORDS

In [None]:
len(all_stop)

337

In [None]:
from gensim.parsing.preprocessing import STOPWORDS
all_stop_gensim=STOPWORDS.union(set(['likes,play']))

In [None]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
test_tokenize=word_tokenize(text)
test_tokenize=[word for word in test_tokenize if not word in all_stop_gensim]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
test_tokenize

['Nick', 'likes', 'play', 'football', ',', 'font', 'tennis']

In [None]:
alll=STOPWORDS
sw_list={'not'}
diff=STOPWORDS.difference(sw_list)
test_tokenize=word_tokenize(text)
test_tokenize=[word for word in test_tokenize if not word in all_stop_gensim]

In [None]:
test_tokenize

['Nick', 'likes', 'play', 'football', ',', 'font', 'tennis']

# Lower CASE

In [None]:
def text_lower(tt):
  low=tt.lower()
  return(low)

text="I AM WONDERFULL AND INTELLIGENT"
print(text_lower(text))

i am wonderfull and intelligent


# Remove HTML PATTERN




In [None]:
import re
def remove_html(text):
  html_pattern=r'<.*?>'
  tt=re.sub(pattern=html_pattern,repl=' ',string=text)
  return(tt)
text="""<br /><br />the filming technique is very unassuming- very old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />"""
remove_html(text)

'  the filming technique is very unassuming- very old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece.   '

In [None]:
from bs4 import BeautifulSoup
def remove_html_bs(text):
  parser=BeautifulSoup(text,"html.parser")
  without_html=parser.get_text(separator=" ")
  return without_html

remove_html_bs(text)

'\n Home Events Circulars Contacts \n'

# Remove Number 

In [None]:
def remove_number(text):
  tt=re.sub(r'\d+',repl="",string=text)
  return tt
text="1234hey i344 5a56m super456789man"
remove_number(text)

'hey i am superman'

# Url

In [None]:
def url_removal(text):
  url_pattern=r'https?://\S+|www\.  |S+'
  rr=re.sub(pattern=url_pattern,repl='',string=text)
  return rr
text="example:https://colab.research.google.com/drive/1z9rsSWfgGpUwN11SYgpiW5Znu__YkLIV#scrollTo=OxCdaku_6das"
url_removal(text)

'example:'

# num to words

In [None]:
pip install num2words



In [None]:
from num2words import num2words
def num_to_word(text):

  from num2words import num2words

  after_split = text.split()

  for idx in range(len(after_split)):

    if after_split[idx].isdigit():

      after_split[idx] = num2words(after_split[idx])

  return ' '.join(after_split)
example="This an example 1"
print(num_to_word(example))

This an example one


In [None]:
def remove_special(text):
  return re.sub(r'[^a-zA-Z0-9\s]+', '',text)

# Spell check

In [None]:
pip install spellchecker

Collecting spellchecker
  Downloading spellchecker-0.4.tar.gz (3.9 MB)
[K     |████████████████████████████████| 3.9 MB 14.3 MB/s 
Collecting inexactsearch
  Downloading inexactsearch-1.0.2.tar.gz (21 kB)
Collecting soundex>=1.0
  Downloading soundex-1.1.3.tar.gz (9.1 kB)
Collecting silpa_common>=0.3
  Downloading silpa_common-0.3.tar.gz (9.4 kB)
Building wheels for collected packages: spellchecker, inexactsearch, silpa-common, soundex
  Building wheel for spellchecker (setup.py) ... [?25l[?25hdone
  Created wheel for spellchecker: filename=spellchecker-0.4-py3-none-any.whl size=3966515 sha256=9411b698505d6a2b4108c007c294b016c6eabd9dd1a2c9328752d61cb2929916
  Stored in directory: /root/.cache/pip/wheels/58/e9/48/b82b733a7a0a9cc52ed239ccf082ff33e2fbda71670ddd3349
  Building wheel for inexactsearch (setup.py) ... [?25l[?25hdone
  Created wheel for inexactsearch: filename=inexactsearch-1.0.2-py3-none-any.whl size=7141 sha256=e65bd352fc9c87496e93fa4a4afa82388de6fe633795ab0ebf4d5342f2d

In [None]:
def accented_to_ascii(text):

  import unidecode

  text = unidecode.unidecode(text)

  return text

In [None]:
def spell_autocorrect(text):

  from autocorrect import Speller

  from nltk import word_tokenize

  import nltk

  nltk.download('punkt')

  spell = Speller(lang='en')

  correct_words = []

  for word in word_tokenize(text):

    correct_word = spell(word)

    correct_words.append(correct_word)

  return " ".join(correct_words)

In [None]:
import pandas as pd
df=pd.read_csv("/content/drive/MyDrive/Datasets/IMDB Dataset.csv")

In [None]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
df['review']=df['review'].apply(text_lower)
df['review']=df['review'].apply(remove_html)
df['review']=df['review'].apply(remove_special)
df['review']=df['review'].apply(num_to_word)

In [None]:
from gensim.parsing.preprocessing import remove_stopwords
df['review']=df['review'].apply(remove_stopwords)

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
df['review']=df['review'].apply(sent_tokenize)
df['review']=df['review'].apply(word_tokenize)

In [None]:
df['review'][1]

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv",encoding="ISO-8859-1",usecols=['v1','v2'])

In [None]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize as st
from nltk.stem import WordNetLemmatizer
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
corpus=[]
wordnet=WordNetLemmatizer()
#lenght=len(df)
for i in range(0,5572):
  rev=re.sub(r'[^A-Za-z]',' ',df['v2'][i])
  rev=rev.lower()
  rev=rev.split()
  rev=[wordnet.lemmatize(word) for word in rev if word not in stopwords.words('english')]
  rev=' '.join(rev)
  corpus.append(rev)

In [None]:
corpus[1]

'ok lar joking wif u oni'

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
x=cv.fit_transform(corpus)

In [None]:
x=x.toarray()

In [None]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y=le.fit_transform(df['v1'])

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [None]:
from sklearn.tree import DecisionTreeClassifier
model=DecisionTreeClassifier(random_state=1)
model.fit(x_train,y_train)

DecisionTreeClassifier(random_state=1)

In [None]:
y_pred=model.predict(x_test,y_test)
y_pred

In [None]:
model.score(x_test,y_test)

0.9730941704035875

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
cm=confusion_matrix(y_test,y_pred)

In [None]:
cm

In [None]:
df=pd.read_csv('https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/master/IMDB-Dataset.csv')


In [None]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [None]:
df.shape

(50000, 2)

In [None]:
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [None]:
corpus=[]
wordnet=WordNetLemmatizer()
#lenght=len(df)
for i in range(0,50000):
  CLEANR = re.compile('<.*?>') 
  re.sub(CLEANR, '', df['review'][i])
  rev=re.sub(r'[^A-Za-z]',' ',rev)
  rev=rev.lower()
  rev=rev.split()
  rev=[wordnet.lemmatize(word) for word in rev if word not in stopwords.words('english')]
  rev=' '.join(rev)
  corpus.append(rev)

In [None]:
corpus[1]

'rofl true name'

# **Twitter Sentiment Analysis**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import re


from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
train=pd.read_csv("/content/drive/MyDrive/Datasets/train_E6oV3lV.csv/train_E6oV3lV.csv")
test=pd.read_csv('/content/drive/MyDrive/Datasets/test_tweets_anuFYb8.csv')

In [None]:
submission=pd.read_csv('/content/drive/MyDrive/Datasets/sample_submission_gfvA5FD.csv')

In [None]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [None]:
def preprocessing(text):
  text=text.lower()
  text=re.sub(r"\d+",'',text)
  text=re.sub(r"@"," ",text)
  text=re.sub(r"#","",text)
  text=re.sub(r"[0-9A-Za-z]")
  print(text)

In [None]:
preprocessing("@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked")

In [None]:
train['tweet'].apply(preprocessing)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
nofilter family amazing   finally haha ðð drink sheffield 
colorful painting that gives u   feeling of a lazy&amp; lovely summer sunday sunset  sundayfunday 
like this litle litle never saw in my life   lovebabies smallgirl smallfingers holidayinlithuania â feeling loved
 â us: firmer underlying trends in the economy to reasse themselves â rbc cm   blog silver gold forex
throwback thursday liverpool thecavernclub and  user tourist tshi     theâ¦ 
vehicle   gorilla simulator: you need to do to adapt to the environment. the need to tear the city. materia 
i want to show the world. down syndrome is beautiful.  downsyndrome t awareness love beautiful   proudmom
 user  user well we didn't have the luxury of playing mexico. we've at least done it once. you've never done it. ever. dosacero  
loveofmylife fiance   bestfriends love adamson ð 
sydney vividsydney  user blessed beauty beautiful happiness   lights love

0        None
1        None
2        None
3        None
4        None
         ... 
31957    None
31958    None
31959    None
31960    None
31961    None
Name: tweet, Length: 31962, dtype: object

In [None]:
df.shape

(31962, 3)

In [None]:
stop_words = set(stopwords.words('english'))
stop = [x.lower() for x in stop_words]
lemma = WordNetLemmatizer()

In [None]:
shortcuts = {'u': 'you', 'y': 'why', 'r': 'are', 'doin': 'doing', 'hw': 'how', 'k': 'okay', 'm': 'am', 'b4': 'before',
'idc': "i do not care", 'ty': 'thankyou', 'wlcm': 'welcome', 'bc': 'because', '<3': 'love', 'xoxo': 'love',
'ttyl': 'talk to you later', 'gr8': 'great', 'bday': 'birthday', 'awsm': 'awesome', 'gud': 'good', 'h8': 'hate',
'lv': 'love', 'dm': 'direct message', 'rt': 'retweet', 'wtf': 'hate', 'idgaf': 'hate',
'irl': 'in real life', 'yolo': 'you only live once'}

In [None]:
def clean(text):
  text = text.lower()
  # keep alphanumeric characters only
  text = re.sub('\W+', ' ', text).strip()
  text = text.replace('user', '')
  # tokenize
  text_token = word_tokenize(text)
  # replace shortcuts using dict
  full_words = []
  for token in text_token:
    if token in shortcuts.keys():
      token = shortcuts[token]
    full_words.append(token)
  words_alpha = [re.sub(r'\d+', '', word) for word in full_words]
  words_big = [word for word in words_alpha if len(word)>2]
  stemmed_words = [lemma.lemmatize(word) for word in words_big]
      # join list elements to string
  clean_text = " ".join(stemmed_words)
  clean_text = clean_text.replace(' ', ' ')
  clean_text = clean_text.replace(' ', ' ')
  return clean_text

In [None]:
X_train = train.tweet
y = train.label
X_test = test.tweet

In [None]:
clean_Xtrain = X_train.apply(lambda x: clean(x))

In [None]:
clean_Xtest = X_test.apply(lambda x: clean(x))

In [None]:
vectorizer = CountVectorizer(max_df=0.5)
# vectorizer = TfidfVectorizer(ngram_range=(1,3), max_df=0.5)
X = vectorizer.fit_transform(clean_Xtrain)
X_test = vectorizer.transform(clean_Xtest)

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
print(model)

LogisticRegression()


In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=0)
# calculate f1 score
model.fit(X_train,y_train)
y_pred = model.predict(X_val)
print('Accuracy:', accuracy_score(y_pred, y_val))
print("F1 Score: ", f1_score(y_pred, y_val))

Accuracy: 0.9627717816361645
F1 Score:  0.6404833836858006


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
df = pd.DataFrame()
df['y_pred'] = y_pred
df['y_pred'].value_counts()

0    6139
1     254
Name: y_pred, dtype: int64

In [None]:
model.fit(X, y)
y_pred = model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
submission['label'] = y_pred
submission.to_csv('/content', index=False)

IsADirectoryError: ignored

In [None]:
 str = "How are 3 you : . How !; 12 is everything" 
 matches = re.sub(r'\d+','',str) 
 matches

'How are  you : . How !;  is everything'

In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
def lemmatize_word(text):

   word_tokens = word_tokenize(text)

   lemmas = [lemmatizer.lemmatize(word,pos ='v')  for word in word_tokens]

   return lemmas

text = 'data science uses scientific methods algorithms and many types of processes'

lemmatize_word(text)

['data',
 'science',
 'use',
 'scientific',
 'methods',
 'algorithms',
 'and',
 'many',
 'type',
 'of',
 'process']

# **Deep learning**

In [None]:
from keras.preprocessing.text import Tokenizer

t=Tokenizer()

sent=["I am a dico dancer "," i am kamal g anandh "]

t.fit_on_texts(sent)

In [None]:
t.document_count

2

In [None]:
#repeating words
t.word_counts

OrderedDict([('i', 2),
             ('am', 2),
             ('a', 1),
             ('dico', 1),
             ('dancer', 1),
             ('kamal', 1),
             ('g', 1),
             ('anandh', 1)])

In [None]:
t.word_index

{'a': 3,
 'am': 2,
 'anandh': 8,
 'dancer': 5,
 'dico': 4,
 'g': 7,
 'i': 1,
 'kamal': 6}

In [None]:
t.word_docs

defaultdict(int,
            {'a': 1,
             'am': 2,
             'anandh': 1,
             'dancer': 1,
             'dico': 1,
             'g': 1,
             'i': 2,
             'kamal': 1})

In [None]:
st="machine Learning"

seq=t.texts_to_sequences(st)
seq

[[], [3], [], [], [1], [], [], [], [], [], [3], [], [], [1], [], [7]]

In [None]:
#text_to_sequemces
t.fit_on_texts(sent)

sequences= t.texts_to_sequences(sent)

print("sequences generated:",sequences)

sequences generated: [[1, 2, 3, 4, 5], [1, 2, 6, 7, 8]]


In [None]:
doc=["marvellous and machine learning ","amazing  machine learning in artificial intelligence","deep learning subset of machine learning ","Deep learning's computer vision","marvellous natural language processing"]

In [None]:
t.fit_on_texts(doc)
encoded_docs=t.texts_to_matrix(doc,mode='binary')
encoded_docs

array([[0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
        1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 1., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 1., 1., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 1., 1.]])

In [None]:
encoded_docs=t.texts_to_matrix(doc,mode='count')
encoded_docs

array([[0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
        1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 2., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 1., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 1., 1., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 1., 1.]])

In [None]:
encoded_docs=t.texts_to_matrix(doc,mode='tfidf')
encoded_docs

array([[0.        , 0.        , 0.        , 1.178655  , 1.178655  ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 1.38629436, 0.        , 1.70474809, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 1.178655  , 1.178655  ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.70474809,
        1.70474809, 1.70474809, 1.70474809, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 1.99563638, 1.178655  ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 1.38629436, 0.        , 0.        ,
        0.        , 0.        , 0.        , 1.70474809, 1.70474809,
      

In [None]:
encoded_docs=t.texts_to_matrix(doc,mode='freq')
encoded_docs

array([[0.        , 0.        , 0.        , 0.25      , 0.25      ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.25      , 0.        , 0.25      , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.16666667, 0.16666667,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.16666667,
        0.16666667, 0.16666667, 0.16666667, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.33333333, 0.16666667,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.16666667, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.16666667, 0.16666667,
      

In [None]:
#dataset 

In [None]:
from keras.datasets import reuters 
import tensorflow as tf
import keras 

In [None]:
(x_train,y_train),(x_test,y_test)=reuters.load_data(num_words=None ,test_split=0.2)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters.npz


In [None]:
print(x_train.shape)
print(y_train.shape)

(8982,)
(8982,)


In [None]:
y_train

array([ 3,  4,  3, ..., 25,  3, 25])

In [None]:
import numpy as np
np.unique(y_train)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45])

In [None]:
num_classes=max(y_train)+1
print("No of classes :",num_classes)

No of classes : 46


In [None]:
import tensorflow as tf
import keras

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
max_words=5000

In [None]:
tokenizer=Tokenizer(num_words=max_words)
x_train=tokenizer.sequences_to_matrix(x_train,mode="binary")
x_test=tokenizer.sequences_to_matrix(x_test,mode="binary")

In [None]:
y_train=tf.keras.utils.to_categorical(y_train,num_classes=46)
y_test=tf.keras.utils.to_categorical(y_test,num_classes=46)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation

In [None]:
model=Sequential()
model.add(Dense(512,input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.25))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [None]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=["Accuracy"])


In [None]:
batch_size=32
epochs=10

In [None]:
history=model.fit(x_train,y_train,batch_size=batch_size,epochs=epochs,verbose=1,validation_split=0.1)
score=model.evaluate(x_test,y_test,batch_size=batch_size,verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
#tfidf mode

In [None]:

import pandas as pd
train=pd.read_csv("/content/drive/MyDrive/Datasets/train_E6oV3lV.csv/train_E6oV3lV.csv")
test=pd.read_csv('/content/drive/MyDrive/Datasets/test_tweets_anuFYb8.csv')

In [None]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [None]:
x=np.array(train['tweet'])
y=np.array(train['label'])

In [None]:
np.unique(y)

array([0, 1])

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=02)

In [None]:
x_train[223]

"special delivery in the mail today! we're in business y'all! #businesscards #vistaprint   "

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
t=Tokenizer()
x_train=t.texts_to_sequences(x_train)

In [None]:
x_test=t.texts_to_sequences(x_train)

In [None]:
x_test

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],


In [None]:
tokenizer.fit_on_sequences()

In [None]:
tokenizer=Tokenizer(num_words=max_words)
x_train=tokenizer.sequences_to_matrix(x,mode="binary")
x_test=tokenizer.sequences_to_matrix(y,mode="binary")

TypeError: ignored

text classification using snn

In [None]:
!unzip("/content/train.csv.zip")

/bin/bash: -c: line 0: syntax error near unexpected token `"/content/train.csv.zip"'
/bin/bash: -c: line 0: `unzip("/content/train.csv.zip")'


In [None]:
import zipfile

In [None]:
zip_ref = zipfile.ZipFile("/content/train.csv.zip", 'r')
zip_ref.extractall("/content")
zip_ref.close()

In [None]:
import pandas as pd
df=pd.read_csv("/content/train.csv")
df.head() 

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [None]:
df.shape

(27481, 4)

In [None]:
import re
import shutil
import string
import tensorflow as tf
import os 
from tf.keras import regularisation
from tf.keras import layers
from tf.keras import losses
from collections import  Counter


In [None]:
def remove_url_special(text):
  text=(re.sub(r'[^A-Za-z]',' ',text))
  return text

In [None]:
remove_url_special("I`d have responded, if I were going 	")

'I d have responded  if I were going  '

In [None]:
x=df['text'].apply(remove_url_special)

TypeError: ignored

In [None]:
df['text']

0                      I`d have responded, if I were going
1            Sooo SAD I will miss you here in San Diego!!!
2                                my boss is bullying me...
3                           what interview! leave me alone
4         Sons of ****, why couldn`t they put them on t...
                               ...                        
27476     wish we could come see u on Denver  husband l...
27477     I`ve wondered about rake to.  The client has ...
27478     Yay good for both of you. Enjoy the break - y...
27479                           But it was worth it  ****.
27480       All this flirting going on - The ATG smiles...
Name: text, Length: 27481, dtype: object