In [None]:
pip install twint

Collecting twint
  Downloading twint-2.1.20.tar.gz (31 kB)
Collecting aiohttp
  Downloading aiohttp-3.7.4.post0-cp37-cp37m-manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 12.8 MB/s 
[?25hCollecting aiodns
  Downloading aiodns-3.0.0-py3-none-any.whl (5.0 kB)
Collecting cchardet
  Downloading cchardet-2.1.7-cp37-cp37m-manylinux2010_x86_64.whl (263 kB)
[K     |████████████████████████████████| 263 kB 53.2 MB/s 
[?25hCollecting elasticsearch
  Downloading elasticsearch-7.14.1-py2.py3-none-any.whl (363 kB)
[K     |████████████████████████████████| 363 kB 44.6 MB/s 
Collecting aiohttp_socks
  Downloading aiohttp_socks-0.6.0-py3-none-any.whl (9.2 kB)
Collecting schedule
  Downloading schedule-1.1.0-py2.py3-none-any.whl (10 kB)
Collecting fake-useragent
  Downloading fake-useragent-0.1.11.tar.gz (13 kB)
Collecting googletransx
  Downloading googletransx-2.4.2.tar.gz (13 kB)
Collecting pycares>=4.0.0
  Downloading pycares-4.0.0-cp37-cp37m-manylinux2010_x8

In [None]:
#Scraping
import twint
import nest_asyncio
# Patch asyncio so an event loop can be re-entered. Presumably needed because
# Twint drives its own loop inside the notebook's running one — TODO confirm.
nest_asyncio.apply()

import numpy as np
import pandas as pd
import re

# Data Scraping

The data is gathered from Twitter using the Twint library. Below is the code used to scrape the data with Twint.

Steps:
1. Create the configuration object with `twint.Config()`.
2. Provide the keywords in the `Search` parameter. Don't forget to filter out the retweets.
3. Set the language in the `Lang` parameter.
4. Set the scraping limit in the `Limit` parameter.
5. Set the date range using the `Since` and `Until` parameters.
6. Set `c.Pandas` to `True`.
7. Run the search.

**Note: Twint may occasionally run into an error while scraping!**

The keywords used in our search are:
1. alih fungsi lahan pertanian
2. Pertanian digusur
3. lahan pertanian habis
4. konversi lahan sawah
5. sawah jadi jalan tol
6. sawah digusur
7. Keadaan lahan pertanian
8. Kondisi lahan pertanian
9. Kebijakan ketersediaan lahan
10. penyusutan lahan pertanian
11. lahan gersang
12. Ladang pertanian
13. lahan sawah berkurang
14. konversi lahan pertanian
15. alih fungsi lahan sawah


In [None]:
def build_search_config(keyword, lang="id", limit=1000,
                        since="2017-01-01", until="2021-07-15"):
    """Build the Twint search configuration for a single keyword.

    Parameters
    ----------
    keyword : str
        Search phrase. ``-filter:retweets`` is appended so retweets are excluded.
    lang : str
        Value assigned to ``Config.Lang``.
    limit : int
        Value assigned to ``Config.Limit``.
    since, until : str
        Date bounds (``YYYY-MM-DD``) assigned to ``Config.Since`` / ``Config.Until``.

    Returns
    -------
    twint.Config
        Configuration with ``Pandas`` enabled so results land in
        ``twint.storage.panda.Tweets_df``.
    """
    config = twint.Config()
    config.Search = keyword + " -filter:retweets"
    config.Lang = lang
    config.Limit = limit
    config.Since = since
    config.Until = until
    config.Pandas = True
    return config


# Configuration — change only the keyword to repeat the scrape for another phrase.
c = build_search_config("alih fungsi lahan pertanian")

# Run
twint.run.Search(c)
Tweets_df = twint.storage.panda.Tweets_df

In [None]:
# NOTE(review): this replaces the frame scraped above with the contents of
# 'Tweets_df.csv'; no visible cell writes that file — confirm where it comes from.
Tweets_df = pd.read_csv('Tweets_df.csv')

## Remove https:// and Drop Duplicates

After the data has been gathered, remove the duplicates. Tweets that contain a link such as "https://" or "https://t.co/" also need to be removed, since most tweets with links are associated with news, and news is not used as data for this sentiment analysis.

In [None]:
#Remove Https://
# Flag every tweet that contains a link with one vectorised regex test. This
# replaces the chained-indexing write (`Tweets_df['tweet'][i] = np.nan`), which
# raises SettingWithCopyWarning and only hit the right rows while the index
# labels happened to equal the loop counter.
# na=True sends missing tweets down the same discard path as linked ones.
has_link = Tweets_df['tweet'].str.contains("https?://[A-Za-z0-9./]+", regex = True, na = True)

# dropna() is kept so rows with a missing value in any column are still
# discarded, exactly as before; the index is then renumbered from 0.
Tweets_df = Tweets_df[~has_link].dropna().reset_index(drop = True)

In [None]:
#Remove Https://t.co
# Flag tweets containing a t.co short link with one vectorised regex test. This
# replaces the chained-indexing write (`Tweets_df['tweet'][i] = np.nan`), which
# raises SettingWithCopyWarning and only hit the right rows while the index
# labels happened to equal the loop counter.
# na=True sends missing tweets down the same discard path as linked ones.
has_short_link = Tweets_df['tweet'].str.contains("https?://t.co/[A-Za-z0-9./]+", regex = True, na = True)

# dropna() is kept so rows with a missing value in any column are still
# discarded, exactly as before; the index is then renumbered from 0.
Tweets_df = Tweets_df[~has_short_link].dropna().reset_index(drop = True)

In [None]:
# Rows are compared on the tweet text only; repeated texts are dropped.
Tweets_df = Tweets_df.drop_duplicates(subset = ['tweet'])

In [None]:
# (rows, columns) remaining after link removal and de-duplication.
Tweets_df.shape

(4748, 3)

## Save the Result

Here we have finished scraping the tweets with "alih fungsi lahan pertanian" as the keyword. Save the result in CSV format.

In [None]:
# index = False keeps the DataFrame index out of the written file.
Tweets_df.to_csv('alih_fungsi_lahan_pertanian.csv', index = False)

Repeat the process for the other keywords. After that, merge all the results together into one dataset. 

# Data Preprocessing

In [None]:
# Colab-only helper: uploads the chosen local file(s) into the runtime's
# working directory so the following cells can read them by name.
from google.colab import files
uploaded = files.upload()

Saving Scraped.csv to Scraped.csv


In [None]:
# Load the uploaded scrape (presumably the merged per-keyword results — confirm).
dataset = pd.read_csv('Scraped.csv')

In [None]:
# Preview the first rows of the raw tweets.
dataset.head()

Unnamed: 0,conversation_id,date,tweet
0,1.41e+18,2021-06-24 8:41:03,"@Arif_kate Di gua udh ga ada sawah, digusur se..."
1,1.41e+18,2021-06-22 1:05:52,teriak² harga properti mahal. akhirnya sawah d...
2,1.41e+18,2021-06-18 8:41:56,@hidayat_387 Gunungnya udah dikeruk. Sawah uda...
3,1.41e+18,2021-06-17 20:55:28,@TaniHitam Semua sawah akan digusur dinggo mba...
4,1.41e+18,2021-06-17 19:41:33,Terpikir jg klo fenomena skrg proyek bangunan ...


In [None]:
# Drop rows that are identical across every column.
dataset = dataset.drop_duplicates()

In [None]:
# (rows, columns) after dropping fully duplicated rows.
dataset.shape

(4748, 3)

## Tweet Cleaning

### 1. @mention

In [None]:
# Delete every @mention from each tweet. A comprehension replaces the manual
# append loop, whose counter `i` was incremented but never read.
bersih = [re.sub('@[A-Za-z0-9_]+','', tweet) for tweet in dataset['tweet']]

In [None]:
# Write the cleaned texts held in `bersih` back into the tweet column.
dataset['tweet'] = bersih

In [None]:
# Trim leading/trailing whitespace; the result is assigned back so it persists.
dataset['tweet'] = dataset['tweet'].str.strip()

### 2. Hashtag

In [None]:
# Replace every hashtag with a single space. A comprehension replaces the manual
# append loop, whose counter `i` was incremented but never read.
bersih = [re.sub("#[a-zA-Z0-9_-]*", " ", tweet) for tweet in dataset['tweet']]

dataset['tweet'] = bersih
# .str.strip() returns a new Series, so assign it back; previously the trimmed
# copy was only displayed and the padded text stayed in the frame.
dataset['tweet'] = dataset['tweet'].str.strip()
dataset['tweet']

0       Di gua udh ga ada sawah, digusur sentraland se...
1       teriak² harga properti mahal. akhirnya sawah d...
2       Gunungnya udah dikeruk. Sawah udah jadi pabrik...
3       Semua sawah akan digusur dinggo mbangun dalan ...
4       Terpikir jg klo fenomena skrg proyek bangunan ...
                              ...                        
4743    Nggak punya lahan untuk tanaman, makanya rumah...
4744    memberi latihan dan memberi pengalaman secara ...
4745    Syoknya tgk pakaian cikguÂ² ni. HariÂ² boleh p...
4746    Fungsi tanggul ini mampu melindungi kawasan la...
4747    tingkatkan proses dimana desa yg membutuhkan a...
Name: tweet, Length: 4748, dtype: object

### 3. Emoji

In [None]:
def deEmojify(text):
    """Return ``text`` with emoji from four Unicode ranges removed.

    Each run of characters in the emoticon, symbol/pictograph,
    transport/map-symbol and flag ranges listed below is deleted; every
    other character is returned unchanged.
    """
    emoji_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
    )
    emoji_run = re.compile("[" + "".join(emoji_ranges) + "]+", flags = re.UNICODE)
    return emoji_run.sub("", text)

In [None]:
# Run deEmojify over every tweet. A comprehension replaces the manual append
# loop, whose counter `i` was incremented but never read.
bersih = [deEmojify(tweet) for tweet in dataset['tweet']]

dataset['tweet'] = bersih
# .str.strip() returns a new Series, so assign it back; previously the trimmed
# copy was only displayed and the padded text stayed in the frame.
dataset['tweet'] = dataset['tweet'].str.strip()
dataset['tweet']

0       Di gua udh ga ada sawah, digusur sentraland se...
1       teriak² harga properti mahal. akhirnya sawah d...
2       Gunungnya udah dikeruk. Sawah udah jadi pabrik...
3       Semua sawah akan digusur dinggo mbangun dalan ...
4       Terpikir jg klo fenomena skrg proyek bangunan ...
                              ...                        
4743    Nggak punya lahan untuk tanaman, makanya rumah...
4744    memberi latihan dan memberi pengalaman secara ...
4745    Syoknya tgk pakaian cikguÂ² ni. HariÂ² boleh p...
4746    Fungsi tanggul ini mampu melindungi kawasan la...
4747    tingkatkan proses dimana desa yg membutuhkan a...
Name: tweet, Length: 4748, dtype: object

### 4. NonLetter

In [None]:
# Replace every character that is not an ASCII letter with a space. A
# comprehension replaces the manual append loop, whose counter `i` was
# incremented but never read.
bersih = [re.sub("[^a-zA-Z]", " ", tweet) for tweet in dataset['tweet']]

dataset['tweet'] = bersih
# .str.strip() returns a new Series, so assign it back; previously the trimmed
# copy was only displayed and the padded text stayed in the frame.
dataset['tweet'] = dataset['tweet'].str.strip()
dataset['tweet']

0       Di gua udh ga ada sawah  digusur sentraland se...
1       teriak  harga properti mahal  akhirnya sawah d...
2       Gunungnya udah dikeruk  Sawah udah jadi pabrik...
3       Semua sawah akan digusur dinggo mbangun dalan ...
4       Terpikir jg klo fenomena skrg proyek bangunan ...
                              ...                        
4743    Nggak punya lahan untuk tanaman  makanya rumah...
4744    memberi latihan dan memberi pengalaman secara ...
4745    Syoknya tgk pakaian cikgu   ni  Hari   boleh p...
4746    Fungsi tanggul ini mampu melindungi kawasan la...
4747    tingkatkan proses dimana desa yg membutuhkan a...
Name: tweet, Length: 4748, dtype: object

### 5. Remove Duplicates

In [None]:
# Cleaning can make previously different tweets identical; drop rows whose
# cleaned text repeats, modifying `dataset` in place.
dataset.drop_duplicates(subset = ['tweet'], inplace = True)

In [None]:
# (rows, columns) after removing tweets whose cleaned text is duplicated.
dataset.shape

(4614, 3)

## Labelling Sentiment with InSet

In [None]:
# Colab-only helper: upload the lexicon files read by the next cell
# (positif.csv and negatif.csv) into the runtime's working directory.
from google.colab import files
uploaded = files.upload()

Saving negatif.csv to negatif.csv
Saving positif.csv to positif.csv


In [None]:
# Sentiment lexicons: later cells read a `word` and a `weight` column from each.
pos = pd.read_csv('positif.csv')
neg = pd.read_csv('negatif.csv')

In [None]:
# Preview the first rows of the positive lexicon.
pos.head()

Unnamed: 0,word,weight
0,hai,3
1,merekam,2
2,ekstensif,3
3,paripurna,1
4,detail,2


In [None]:
# Preview the first rows of the negative lexicon.
neg.head()

Unnamed: 0,word,weight
0,putus tali gantung,-2
1,gelebah,-2
2,gobar hati,-2
3,tersentuh (perasaan),-1
4,isak,-5


In [None]:
# Extra domain-specific negative words. `neg["digusur"] = -5` would create a new
# *column* called "digusur" rather than a lexicon entry, so the words are added
# as rows of the existing `word` / `weight` columns instead.
extra_negative = pd.DataFrame({"word": ["digusur", "dibabat"], "weight": [-5, -4]})

# Remove any existing rows for these words first, so re-running the cell (or a
# lexicon that already lists them) cannot leave conflicting weights behind.
neg = pd.concat(
    [neg[~neg["word"].isin(extra_negative["word"])], extra_negative],
    ignore_index = True,
)

In [None]:
# Build word -> weight lookups for both lexicons. zip() pairs each word with the
# weight on the same row; the previous loops never advanced their counter, so
# every word was given the weight stored at label 0.
p = dict(zip(pos['word'], pos['weight']))
n = dict(zip(neg['word'], neg['weight']))

In [None]:
def sentiment_analysis_lexicon_indonesia(text):
    """Score a tokenised tweet against the positive and negative lexicons.

    Parameters
    ----------
    text : iterable of str
        Tokens of one tweet. Lookups are exact dictionary matches, so they
        are case-sensitive.

    Returns
    -------
    tuple
        ``(score, polarity)``: ``score`` is the sum of the weights of every
        token found in ``p`` plus every token found in ``n``; ``polarity`` is
        ``'positive'``, ``'negative'`` or ``'neutral'`` for a score above,
        below or equal to zero.
    """
    # Positive hits are summed first, then negative hits are added on top.
    positive_total = sum(p[token] for token in text if token in p)
    score = sum((n[token] for token in text if token in n), positive_total)

    if score > 0:
        return score, 'positive'
    if score < 0:
        return score, 'negative'
    return score, 'neutral'

In [None]:
# Split each tweet on whitespace and score the resulting token lists.
token_lists = dataset['tweet'].str.split()
scored = token_lists.apply(sentiment_analysis_lexicon_indonesia)

# Transpose the per-tweet (score, polarity) pairs into two parallel tuples.
results = list(zip(*scored))
dataset['polarity_score'] = results[0]
dataset['sentiment'] = results[1]
print(dataset['sentiment'].value_counts())

positive    2414
negative    1722
neutral      478
Name: sentiment, dtype: int64


### Drop the Neutrals

In [None]:
# Preview the tweets together with their polarity score and sentiment label.
dataset.head()

Unnamed: 0,conversation_id,date,tweet,polarity_score,sentiment
0,1.41e+18,2021-06-24 8:41:03,Di gua udh ga ada sawah digusur sentraland se...,-3,negative
1,1.41e+18,2021-06-22 1:05:52,teriak harga properti mahal akhirnya sawah d...,0,neutral
2,1.41e+18,2021-06-18 8:41:56,Gunungnya udah dikeruk Sawah udah jadi pabrik...,-2,negative
3,1.41e+18,2021-06-17 20:55:28,Semua sawah akan digusur dinggo mbangun dalan ...,-2,negative
4,1.41e+18,2021-06-17 19:41:33,Terpikir jg klo fenomena skrg proyek bangunan ...,3,positive


In [None]:
# Encode labels as negative -> 0 and positive -> 1. 'neutral' has no entry in
# the mapping, so those rows become NaN.
dataset["sentiment"] = dataset['sentiment'].map({"negative" : 0, "positive" : 1})

In [None]:
# Drop every row containing a NaN in any column, in place. This removes the
# tweets whose sentiment has no numeric code.
dataset.dropna(inplace = True)

In [None]:
# Renumber the rows from 0 and discard the old index rather than keeping it
# as an extra column.
dataset = dataset.reset_index(drop = True)

In [None]:
# (rows, columns) after the rows with missing values were dropped.
dataset.shape

(4136, 5)

In [None]:
# Remove the raw lexicon score column in place; the `sentiment` column is kept.
dataset.drop(['polarity_score'], axis = 1, inplace = True)

In [None]:
# Preview the labelled dataset that will be saved.
dataset.head()

Unnamed: 0,conversation_id,date,tweet,sentiment
0,1.41e+18,2021-06-24 8:41:03,Di gua udh ga ada sawah digusur sentraland se...,0.0
1,1.41e+18,2021-06-18 8:41:56,Gunungnya udah dikeruk Sawah udah jadi pabrik...,0.0
2,1.41e+18,2021-06-17 20:55:28,Semua sawah akan digusur dinggo mbangun dalan ...,0.0
3,1.41e+18,2021-06-17 19:41:33,Terpikir jg klo fenomena skrg proyek bangunan ...,1.0
4,1.41e+18,2021-06-17 15:33:53,sedih rasanya tu sawah digusur buat perumahan ...,1.0


### Save The Result

In [None]:
# Persist the cleaned, labelled tweets; index = False keeps the index out of the file.
dataset.to_csv('Cleaned_and_labeled.csv', index = False)

# Transform Dataset

In [None]:
# Reload the saved dataset so this section can start from the CSV file.
cleaned = pd.read_csv('Cleaned_and_labeled.csv')

In [None]:
# Sentiment labels as a Series.
# NOTE(review): the visible later cells read cleaned["sentiment"] directly and
# never use `target` — confirm whether it is still needed.
target = cleaned['sentiment']

In [None]:
# Display the label column.
target

0       0.0
1       0.0
2       0.0
3       1.0
4       1.0
       ... 
4131    1.0
4132    1.0
4133    1.0
4134    1.0
4135    0.0
Name: sentiment, Length: 4136, dtype: float64

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = cleaned['tweet'].values
vectorizer = TfidfVectorizer() # default settings: the whole vocabulary is kept

# Keep the TF-IDF matrix in the sparse form fit_transform returns. Calling
# .toarray() would allocate a dense documents x vocabulary array that is almost
# entirely zeros; train_test_split and LogisticRegression accept sparse input.
# NOTE(review): the vectorizer is fitted on all tweets before the train/test
# split, so vocabulary/IDF statistics from the test tweets reach the training
# features — consider fitting on the training split only.
X = vectorizer.fit_transform(corpus)
y = cleaned["sentiment"].values

In [None]:
# (number of tweets, number of TF-IDF vocabulary terms).
X.shape

(4136, 14008)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the
# partition reproducible. train/test are the feature rows, s_train/s_test the
# matching sentiment labels.
train, test, s_train, s_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

In [None]:
# Shapes of the training/test features and their label vectors.
train.shape, test.shape, s_train.shape, s_test.shape

((2895, 14008), (1241, 14008), (2895,), (1241,))

# Modelling

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# 10-fold cross-validation of a default logistic regression on the training split.
baseline = LogisticRegression()
accuracies = cross_val_score(estimator = baseline, X = train, y = s_train, cv = 10)

# Report the mean and spread of the fold accuracies as percentages.
mean_pct = accuracies.mean()*100
std_pct = accuracies.std()*100
print("Accuracy: {:.2f} %".format(mean_pct))
print("Standard Deviation: {:.2f} %".format(std_pct))

Accuracy: 76.13 %
Standard Deviation: 2.22 %


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

logreg = LogisticRegression()
logreg.fit(train, s_train)

# Predict once and reuse the result for both metrics.
test_pred = logreg.predict(test)

# scikit-learn expects (y_true, y_pred): rows are the true classes and columns
# the predicted ones. Passing the predictions first transposed the matrix.
cm = confusion_matrix(s_test, test_pred)
print(cm)
accuracy_score(s_test, test_pred)

[[288  60]
 [244 649]]


0.7550362610797744