In [2]:
!pip install kagglehub # Downloading package for connecting with kaggle



In [17]:
# importing libraries for data reading, preprocessing
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')


In [5]:
# importig packages for dataset reading
import kagglehub
import os

# Download latest version
path = kagglehub.dataset_download("atharvasoundankar/impact-of-ai-on-digital-media-2020-2025")

# Files examination in following path
print("Files in dataset folder:")
for f in os.listdir(path):
    print(f)

# Making the full path to exact .csv file with dataset
csv_path = os.path.join(path, "Global_AI_Content_Impact_Dataset.csv")

# Reading .csv file and representation in DataFrame
df = pd.read_csv(csv_path)

df.columns

Files in dataset folder:
Global_AI_Content_Impact_Dataset.csv


Index(['Country', 'Year', 'Industry', 'AI Adoption Rate (%)',
       'AI-Generated Content Volume (TBs per year)', 'Job Loss Due to AI (%)',
       'Revenue Increase Due to AI (%)', 'Human-AI Collaboration Rate (%)',
       'Top AI Tools Used', 'Regulation Status', 'Consumer Trust in AI (%)',
       'Market Share of AI Companies (%)'],
      dtype='object')

In [12]:
# Predictive models for nominative feature 'Regulation Status'

# Data preprocessing and importing main classical ML moduls

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score


# Columns renaming in more acceptable and convenient way
df = df.rename(columns = {'AI Adoption Rate (%)': 'adoption_rate','AI-Generated Content Volume (TBs per year)': 'content_volume', 'Job Loss Due to AI (%)': 'job_loss',
                          'Revenue Increase Due to AI (%)': 'revenue_increase', 'Human-AI Collaboration Rate (%)': 'human_collaboration', 'Top AI Tools Used': 'top_tools',
                          'Regulation Status': 'regulation_status', 'Consumer Trust in AI (%)': 'consumer_trust', 'Market Share of AI Companies (%)': 'market_share',
                          'Country': 'country', 'Year': 'year', 'Industry': 'industry'})


# Encoding categorial feature into numeric to use them in predictive models
for feature in df.select_dtypes(include=['object']):
  encoder = LabelEncoder()
  df[f'{feature}'] = encoder.fit_transform(df[f'{feature}'])

# Features selection
X = df.drop(columns=['regulation_status'])

# Variable selection
y = df['regulation_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# Trying different models and calculating their accuracy score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

clf = {
    'Logistic Regression': LogisticRegression(max_iter=500, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier()
}

res = {}

# Accuracy_score calculation in different ML_models
for name, i_clf in clf.items():
  i_clf.fit(X_train, y_train)
  pred = i_clf.predict(X_test)

  acc = accuracy_score(y_test, pred)
  res[name] = acc


# Creating resulting DataFrame and sorting values of accuracy score

res_scores = pd.DataFrame.from_dict(res, orient='index', columns=['Accuracy_score'])
res_scores.sort_values(by='Accuracy_score', ascending=False)

Unnamed: 0,Accuracy_score
Random Forest,0.45
Logistic Regression,0.325
KNN,0.275
Decision Tree,0.2
