In [2]:
#Import libraries and data

import pandas as pd
import matplotlib.pyplot as plt
import re
import warnings
warnings.filterwarnings('ignore')

df_test= pd.read_csv('test.csv')
df_train= pd.read_csv('train.csv')


In [3]:
#Dataframe visualisation
df_train.head()

Unnamed: 0,train_idx,title,description,target,creator_name,creator_location,supporters,created_date,banner_image,full_content,victory_flag
0,0,Stop Iran Regime from Chairing U​.​N. Human Ri...,Rights Council 2023 Social Forum. This year's ...,['António Guterres'],UN Watch,"Geneva, Switzerland",80515,10 May 2023,https://assets.change.org/photos/2/ur/wj/vrUrW...,No Joke: The Islamic Republic of Iran has just...,0
1,1,Save Children In Poverty In Syria,Children in Syria are facing extreme poverty. ...,['UNICEF'],Maialen Alawam,"Watertown, MA, United States",23871,25 Jun 2020,https:https://static.change.org/images/default...,Children in Syria are facing extreme poverty. ...,0
2,2,Save Duke Ellington School Of The Arts,,,It’s Me Tivoni,United States,4051,"Nov. 20, 2022",https://assets.change.org/photos/8/ab/hv/DZaBH...,Save Duke Ellington School Of The ArtsTivoni H...,0
3,3,Mobile operators are creating inequality,"In India, after Reliance Jio (a 4G only mobile...","['Telecom Regulatory Authority of India', 'CEL...",Swastik Raj Chauhan,"Ghaziabad, India",350,"Oct 15, 2017",https://assets.change.org/photos/7/by/in/Kzbyi...,"In India, after Reliance Jio (a 4G only mobile...",0
4,4,Grant Military Burial Honors to Women WWII Pilots,"During World War II, a brave group of women jo...",['U.S. Senate'],Tiffany Miller @tiffbmiller,"Walnut Creek, CA, United States",176092,2 Dec 2015,https://assets.change.org/photos/3/lh/jc/TmLhJ...,"During World War II, a brave group of women jo...",1


In [4]:
# Obtaining the number of null values
number_rows_with_nulls = print(len(df_train.index))
df_train.isna().sum()[df_train.isna().sum() > 0]

1965


description         117
target              117
creator_location     12
full_content         12
dtype: int64

### Data Pre-processing

In [5]:
# Deleting null values
df_train.dropna(inplace=True)
len(df_train.index)

1826

In [7]:
df_train['victory_flag'].value_counts()
#The class distribution is not similar, so it is imbalanced. Decision trees algorithms frequently perform well on imbalanced data, so take it into account

0    1552
1     274
Name: victory_flag, dtype: int64

In [8]:
#Select only the year of the petition for taking into account
df_train['created_date'] = df_train['created_date'].str.replace('-', ' ')
df_train['created_date'] = df_train['created_date'].str.replace(' г.', ' ')
df_train['created_year'] = df_train['created_date'].str.split().str[-1]

df_test['created_date'] = df_test['created_date'].str.replace('-', ' ')
df_test['created_date'] = df_test['created_date'].str.replace(' г.', ' ')
df_test['created_year'] = df_test['created_date'].str.split().str[-1]

In [9]:
#Encode categorical variables
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Transform the columns in the training set
df_train['target_encoded'] = label_encoder.fit_transform(df_train['target'])
df_train['creator_name_encoded'] = label_encoder.fit_transform(df_train['creator_name'])
df_train['creator_location_encoded'] = label_encoder.fit_transform(df_train['creator_location'])


# Transform the columns in the test set
df_test['target_encoded'] = label_encoder.fit_transform(df_test['target'])
df_test['creator_name_encoded'] = label_encoder.fit_transform(df_test['creator_name'])
df_test['creator_location_encoded'] = label_encoder.fit_transform(df_test['creator_location'])

In [10]:
# Select the features (X) and target variable (y)
X_train = df_train[['target_encoded', 'creator_name_encoded', 'creator_location_encoded','created_year', 'supporters']]
y_train = df_train['victory_flag']

X_test = df_test[['target_encoded', 'creator_name_encoded', 'creator_location_encoded','created_year', 'supporters']]


### Model training

In [11]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=0)
dtc.fit(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(dtc.score(X_train, y_train)))

#Checking other algorithms for model training
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print('Accuracy of Logistic regression classifier on training set: {:.2f}'.format(logreg.score(X_train, y_train)))

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print('Accuracy of K-NN classifier on training set: {:.2f}'.format(knn.score(X_train, y_train)))

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
print('Accuracy of LDA classifier on training set: {:.2f}'.format(lda.score(X_train, y_train)))

from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
print('Accuracy of GNB classifier on training set: {:.2f}'.format(gnb.score(X_train, y_train)))

from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'.format(svm.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 1.00
Accuracy of Logistic regression classifier on training set: 0.85
Accuracy of K-NN classifier on training set: 0.85
Accuracy of LDA classifier on training set: 0.88
Accuracy of GNB classifier on training set: 0.84
Accuracy of SVM classifier on training set: 0.85


### Model prediction

Finally, the Decision Tree algorithm was selected.

In [12]:
# Make predictions on the test data
victory_flag_prediction = dtc.predict(X_test)

# Save the predictions as the target value to predict
y_label = victory_flag_prediction 

# Create a new DataFrame with the predictions
df_victory_flag_prediction = pd.DataFrame({'victory_flag_prediction': victory_flag_prediction})

# Save the predictions to a CSV file
df_victory_flag_prediction.to_csv('victory_flag_prediction.csv', index=False)

#### Save results

In [1]:
# Create a new dataframe to store the predictions
df_predictions = pd.DataFrame({'test_idx': df_test['test_idx'],'victory_flag_prediction': victory_flag_prediction})

# Save the predictions to a CSV file
df_predictions.to_csv('predictions.csv', index=False)

df_predictions.to_json(r'C:\Users\louma\anaconda3\envs\International FemHack Hackathon\predictions.json')

NameError: name 'pd' is not defined