# Oracle internship 2024

## Hind MOUTALATTIF


# Data Preparation

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")


In [5]:
train = pd.read_csv('train.csv') # importing train and test data
test = pd.read_csv('test.csv')

In [6]:
train.head(10)

Unnamed: 0,TweetId,Label,TweetText
0,304271250237304833,Politics,'#SecKerry: The value of the @StateDept and @U...
1,304834304222064640,Politics,'@rraina1481 I fear so'
2,303568995880144898,Sports,'Watch video highlights of the #wwc13 final be...
3,304366580664528896,Sports,'RT @chelscanlan: At Nitro Circus at #AlbertPa...
4,296770931098009601,Sports,'@cricketfox Always a good thing. Thanks for t...
5,306713195832307712,Politics,'Dr. Rajan: Fiscal consolidation will create m...
6,306100962337112064,Politics,"FACT: More than 800,000 defense employees will..."
7,305951758759366657,Sports,"'1st Test. Over 39: 0 runs, 1 wkt (M Wade 0, M..."
8,304482567158104065,Sports,Some of Africa's top teams will try and take a...
9,303806584964935680,Sports,'Can you beat the tweet of @RoryGribbell and z...


In [7]:
train.info() ## getting info about our data to see if there is any non available cells

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6525 entries, 0 to 6524
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   TweetId    6525 non-null   int64 
 1   Label      6525 non-null   object
 2   TweetText  6525 non-null   object
dtypes: int64(1), object(2)
memory usage: 153.1+ KB


In [8]:
train['Label'].value_counts() ## here we see that our data is balenced 

Sports      3325
Politics    3200
Name: Label, dtype: int64

In [9]:
# in this cell we try to convert our classes into numerical classes 
train['Label']=train['Label'].map({'Sports':0, 'Politics':1})

In [10]:
train['Label']

0       1
1       1
2       0
3       0
4       0
       ..
6520    1
6521    0
6522    0
6523    0
6524    1
Name: Label, Length: 6525, dtype: int64

# Feature extraction

In [11]:
# we split data
X = train['TweetText']  
y = train['Label']   

In [12]:
"""Here we will convert the text into a numerical format 
using the term frequency-inverse document frequency (Tf-Idf)
to represent each tweet:
"""
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)


In [13]:
X

<6525x16193 sparse matrix of type '<class 'numpy.float64'>'
	with 105723 stored elements in Compressed Sparse Row format>

In [14]:
# here we split our converted data into train and test 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Model training

In [15]:
from sklearn.svm import SVC

In [16]:
"""we start our model trainning using support vector machine 
which is more efficient for binary classification"""

svm_model = SVC()
svm_model.fit(X_train, y_train)

SVC()

# Hyper parameter tuning

In [17]:
"""For hyper parameter tuning we will use Grid Search
that give us the best and optimal set of parameters """

from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 10, 100], 'kernel': ['linear', 'rbf']}
grid_search = GridSearchCV(svm_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_


In [78]:
best_params

{'C': 1, 'kernel': 'linear'}

# Performance

In [79]:
from sklearn.metrics import accuracy_score

y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test , y_pred)

In [80]:
accuracy

0.9484167517875383

# Improvements

## to improve our work we will try a set of models:

In [51]:
import sys
!{sys.executable} -m pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.6-py3-none-win_amd64.whl (70.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.7.6
Collecting xgboost
  Downloading xgboost-1.7.6-py3-none-win_amd64.whl (70.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.7.6


In [81]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm



In [82]:
"""this function will train each model with the data and will calculate
the predictions of test data to give us a final report containg the accuracies of each model"""

def model_training(models):
    for element in models:
        model = models[element]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print("the accuracy of "+element+" is: {:.2f}%".format(accuracy_score(y_test , y_pred)*100))
        

In [83]:
our_models = {"Logistic Regression":LogisticRegression(), "Decision Tree":DecisionTreeClassifier(), "Nearest Neighbors": KNeighborsClassifier(), "Random Forest":RandomForestClassifier(), "XGBClassifier": xgb.XGBClassifier(), "SVM":svm.SVC()}

In [84]:
model_training(our_models)

the accuracy of Logistic Regression is: 93.97%
the accuracy of Decision Tree is: 84.32%
the accuracy of Nearest Neighbors is: 92.34%
the accuracy of Random Forest is: 91.01%
the accuracy of XGBClassifier is: 89.99%
the accuracy of SVM is: 94.84%


# the best model is SVM because of its high accuracy

In [86]:
test

Unnamed: 0,TweetId,TweetText
0,306486520121012224,'28. The home side threaten again through Maso...
1,286353402605228032,'@mrbrown @aulia Thx for asking. See http://t....
2,289531046037438464,'@Sochi2014 construction along the shores of t...
3,306451661403062273,'#SecKerry\u2019s remarks after meeting with F...
4,297941800658812928,'The #IPLauction has begun. Ricky Ponting is t...
...,...,...
2605,282023761044189184,'Qualifier 1 and Eliminator games will be play...
2606,303879735006601216,"@reesedward Hi Edward, it's not a #peacekeepin..."
2607,297956846046703616,'Perera was @SunRisersIPL first #IPL purchase ...
2608,304265049537658880,"'#SecKerry: Thanks to Senator @TimKaine, @RepR..."


In [87]:
## we will vectorize test data 
Z=test['TweetText']
test_vectorized = vectorizer.transform(Z)

In [89]:
test_vectorized

<2610x16193 sparse matrix of type '<class 'numpy.float64'>'
	with 30777 stored elements in Compressed Sparse Row format>

In [90]:
# here we get the predicted data of test using the best model
test_prediction=svm_model.predict(test_vectorized)

In [91]:
test_prediction

array([0, 1, 1, ..., 0, 1, 1], dtype=int64)

"""next we try to form the dataset of submission that cpntain tweet id and labels 
so first we should convert our numerical data into primary classes :
sports and politics and make a csv file based on this dataframe"""

In [93]:
submission=pd.DataFrame({'TweetId' : test['TweetId'] , 'Label' : test_prediction })

In [94]:
submission

Unnamed: 0,TweetId,Label
0,306486520121012224,0
1,286353402605228032,1
2,289531046037438464,1
3,306451661403062273,1
4,297941800658812928,0
...,...,...
2605,282023761044189184,1
2606,303879735006601216,0
2607,297956846046703616,0
2608,304265049537658880,1


In [95]:
submission['Label'] = submission['Label'].map({0 : 'Sports' , 1 : 'Politics'})

In [96]:
submission

Unnamed: 0,TweetId,Label
0,306486520121012224,Sports
1,286353402605228032,Politics
2,289531046037438464,Politics
3,306451661403062273,Politics
4,297941800658812928,Sports
...,...,...
2605,282023761044189184,Politics
2606,303879735006601216,Sports
2607,297956846046703616,Sports
2608,304265049537658880,Politics


In [97]:
submission.to_csv('submission.csv')